""" pyeo.classification =================== Contains every function to do with map classification. This includes model creation, map classification and processes for array manipulation into scikit-learn compatible forms. """ import csv import glob import logging import os from tempfile import TemporaryDirectory import gdal import joblib import numpy as np from osgeo import osr from scipy import sparse as sp from sklearn import ensemble as ens from sklearn.externals import joblib as sklearn_joblib from sklearn.model_selection import cross_val_score from pyeo.coordinate_manipulation import get_local_top_left from pyeo.filesystem_utilities import get_mask_path from pyeo.raster_manipulation import stack_images, create_matching_dataset, apply_array_image_mask, get_masked_array import pyeo.windows_compatability log = logging.getLogger(__name__) def change_from_composite(image_path, composite_path, model_path, class_out_path, prob_out_path=None): """ Stacks an image with a composite and classifies each pixel change with a scikit-learn model The image that is classified is has the following bands 1. composite blue 2. composite green 3. composite red 4. composite IR 5. image blue 6. image green 7. image red 8. image IR Parameters ---------- image_path The path to the image composite_path The path to the composite model_path The path to a .pkl of a scikit-learn classifier that takes 8 features class_out_path A location to save the resulting classification .tif prob_out_path A location to save the probability raster of each pixel """ with TemporaryDirectory() as td: stacked_path = os.path.join(td, "comp_stack.tif") stack_images((composite_path, image_path), stacked_path) classify_image(stacked_path, model_path, class_out_path, prob_out_path) def classify_image(image_path, model_path, class_out_path, prob_out_path=None, apply_mask=False, out_type="GTiff", num_chunks=10, nodata=0, skip_existing = False): """ Produces a class map from a raster and a model. This applies the model's fit() function to each pixel in the input raster, and saves the result into an output raster. The model is presumed to be a scikit-learn fitted model created using one of the other functions in this library (create_model_from_rasters, create_model_from_signatures). To fit into a Parameters ---------- image_path The path to the raster image to be classified. model_path The path to the .pkl file containing the model class_out_path The path that the classified map will be saved at. prob_out_path If present, the path that the class probability map will be stored at. apply_mask If True, uses the .msk file corresponding to the image at image_path to skip any invalid pixels. out_type The raster format of the class image. Defaults to GTiff (geotif) num_chunks The number of chunks the image is broken into prior to classification. The smaller this number, the faster classification will run - but the more likely you are to get a outofmemory error. nodata The value to write to masked pixels skip_existing If true, do not run if class_out_path already exists Notes ----- If you want to create a custom model, the object is presumed to have the following methods and attributes: - model.n_classes_ : the number of classes the model will produce - model.n_cores : The number of CPU cores used to run the model - model.predict() : A function that will take a set of band inputs from a pixel and produce a class. 
    """
    if skip_existing:
        log.info("Checking for existing classification {}".format(class_out_path))
        if os.path.isfile(class_out_path):
            log.info("Class image exists, skipping.")
            return class_out_path
    log.info("Classifying file: {}".format(image_path))
    log.info("Saved model : {}".format(model_path))
    image = gdal.Open(image_path)
    if num_chunks is None:
        log.info("No chunk size given, attempting autochunk.")
        num_chunks = autochunk(image)
        log.info("Autochunk to {} chunks".format(num_chunks))
    try:
        model = sklearn_joblib.load(model_path)
    except (KeyError, TypeError):
        log.warning("Sklearn joblib import failed, trying generic joblib")
        model = joblib.load(model_path)
    class_out_image = create_matching_dataset(image, class_out_path, format=out_type, datatype=gdal.GDT_Byte)
    log.info("Created classification image file: {}".format(class_out_path))
    if prob_out_path:
        try:
            log.info("n classes in the model: {}".format(model.n_classes_))
        except AttributeError:
            log.warning("Model has no n_classes_ attribute (known issue with GridSearch)")
        prob_out_image = create_matching_dataset(image, prob_out_path, bands=model.n_classes_, datatype=gdal.GDT_Float32)
        log.info("Created probability image file: {}".format(prob_out_path))
    model.n_cores = -1
    image_array = image.GetVirtualMemArray()

    if apply_mask:
        mask_path = get_mask_path(image_path)
        log.info("Applying mask at {}".format(mask_path))
        mask = gdal.Open(mask_path)
        mask_array = mask.GetVirtualMemArray()
        image_array = apply_array_image_mask(image_array, mask_array)
        mask_array = None
        mask = None

    # Mask out missing values from the classification
    # At this point, image_array has dimensions [band, y, x]
    log.info("Reshaping image from GDAL to Scikit-Learn dimensions")
    image_array = reshape_raster_for_ml(image_array)
    # Now it has dimensions [x * y, band] as needed for Scikit-Learn

    # Determine where in the image array there are no missing values in any of the bands (axis 1)
    log.info("Finding good pixels without missing values")
    log.info("image_array.shape = {}".format(image_array.shape))
    n_samples = image_array.shape[0]  # gives x * y dimension of the whole image
    good_mask = np.all(image_array != nodata, axis=1)
    good_sample_count = np.count_nonzero(good_mask)
    log.info("No. good values: {}".format(good_sample_count))
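    # Illustration of the mask above (values invented): with nodata=0 and a toy 3-pixel, 2-band array,
    #   np.all(np.array([[5, 7], [0, 3], [2, 9]]) != 0, axis=1) -> [True, False, True]
    # i.e. a pixel only counts as "good" when every band differs from nodata.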
    # if good_sample_count <= 0.5*len(good_mask):  # If the image is less than 50% good pixels, do filtering
    if 1 == 0:  # Removing the filter until we fix the classification issue with it
        log.info("Filtering nodata values")
        good_indices = np.nonzero(good_mask)[0]
        good_samples = np.take(image_array, good_indices, axis=0).squeeze()
        n_good_samples = len(good_samples)
    else:
        # log.info("Not worth filtering nodata, skipping.")
        good_samples = image_array
        good_indices = range(0, n_samples)
        n_good_samples = n_samples
    log.info("   All samples: {}".format(n_samples))
    log.info("   Good samples: {}".format(n_good_samples))
    classes = np.full(n_good_samples, nodata, dtype=np.ubyte)
    if prob_out_path:
        probs = np.full((n_good_samples, model.n_classes_), nodata, dtype=np.float32)

    chunk_size = int(n_good_samples / num_chunks)
    chunk_resid = n_good_samples - (chunk_size * num_chunks)
    log.info("   Number of chunks {} Chunk size {} Chunk residual {}".format(num_chunks, chunk_size, chunk_resid))
    # The chunks iterate over all values in the array [x * y, bands] always with 8 bands per chunk
    for chunk_id in range(num_chunks):
        offset = chunk_id * chunk_size
        # Process the residual pixels with the last chunk
        if chunk_id == num_chunks - 1:
            chunk_size = chunk_size + chunk_resid
        log.info("   Classifying chunk {} of size {}".format(chunk_id, chunk_size))
        chunk_view = good_samples[offset : offset + chunk_size]
        # indices_view = good_indices[offset : offset + chunk_size]
        out_view = classes[offset : offset + chunk_size]  # dimensions [chunk_size]
        out_view[:] = model.predict(chunk_view)
        if prob_out_path:
            log.info("   Calculating probabilities")
            prob_view = probs[offset : offset + chunk_size, :]
            prob_view[:, :] = model.predict_proba(chunk_view)

    log.info("   Creating class array of size {}".format(n_samples))
    class_out_array = np.full((n_samples), nodata)
    for i, class_val in zip(good_indices, classes):
        class_out_array[i] = class_val

    log.info("   Creating GDAL class image")
    class_out_image.GetVirtualMemArray(eAccess=gdal.GF_Write)[:, :] = \
        reshape_ml_out_to_raster(class_out_array, image.RasterXSize, image.RasterYSize)

    if prob_out_path:
        log.info("   Creating probability array of size {}".format(n_samples * model.n_classes_))
        prob_out_array = np.full((n_samples, model.n_classes_), nodata)
        for i, prob_val in zip(good_indices, probs):
            prob_out_array[i] = prob_val
        log.info("   Creating GDAL probability image")
        log.info("   N Classes = {}".format(prob_out_array.shape[1]))
        log.info("   Image X size = {}".format(image.RasterXSize))
        log.info("   Image Y size = {}".format(image.RasterYSize))
        prob_out_image.GetVirtualMemArray(eAccess=gdal.GF_Write)[:, :, :] = \
            reshape_prob_out_to_raster(prob_out_array, image.RasterXSize, image.RasterYSize)

    class_out_image = None
    prob_out_image = None
    if prob_out_path:
        return class_out_path, prob_out_path
    else:
        return class_out_path


def autochunk(dataset, mem_limit=None):
    """
    EXPERIMENTAL Calculates the number of chunks to break a dataset into without a memory error.

    Presumes that 80% of the memory on the host machine is available for use by Pyeo. We want to break the dataset
    into as few chunks as possible without going over mem_limit. mem_limit defaults to the available physical RAM
    on the machine if not specified.

    Parameters
    ----------
    dataset
        The dataset to chunk
    mem_limit
        The maximum amount of memory available to the process. Will be automatically populated from os.sysconf if
        missing.

    Returns
    -------
    The number of chunks to most efficiently break the image into.
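
    Notes
    -----
    Illustrative arithmetic with assumed figures: a 4-band, 10,000 x 10,000 pixel image of 16-bit integers occupies
    10,000 * 10,000 * 4 * 2 bytes (roughly 800 MB). With a 300 MB memory limit, the loop below returns the smallest
    divisor of the pixel count whose chunk fits under the limit - in this case 4 chunks of about 200 MB each.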
""" pixels = dataset.RasterXSize * dataset.RasterYSize bytes_per_pixel = dataset.GetVirtualMemArray().dtype.itemsize*dataset.RasterCount image_bytes = bytes_per_pixel*pixels if not mem_limit: mem_limit = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_AVPHYS_PAGES') # Lets assume that 20% of memory is being used for non-map bits mem_limit = int(mem_limit*0.8) # if I went back now, I would fail basic programming here. for num_chunks in range(1, pixels): if pixels % num_chunks != 0: continue chunk_size_bytes = (pixels/num_chunks)*bytes_per_pixel if chunk_size_bytes < mem_limit: return num_chunks def classify_directory(in_dir, model_path, class_out_dir, prob_out_dir = None, apply_mask=False, out_type="GTiff", num_chunks=10): """ Classifies every file ending in .tif in in_dir using model at model_path. Outputs are saved in class_out_dir and prob_out_dir, named [input_name]_class and _prob, respectively. See the documentation for classification.classify_image() for more details. Parameters ---------- in_dir The path to the directory containing the rasters to be classified. model_path The path to the .pkl file containing the model. class_out_dir The directory that will store the classified maps prob_out_dir The directory that will store the probability maps of the classified maps apply_mask If present, uses the corresponding .msk files to mask the directories out_type The raster format of the class image. Defaults to GTiff (geotif) num_chunks The number of chunks to break an image into. """ log = logging.getLogger(__name__) log.info("Classifying files in {}".format(in_dir)) log.info("Class files saved in {}".format(class_out_dir)) log.info("Prob. files saved in {}".format(prob_out_dir)) for image_path in glob.glob(in_dir+r"/*.tif"): image_name = os.path.basename(image_path).split('.')[0] class_out_path = os.path.join(class_out_dir, image_name+"_class.tif") if prob_out_dir: prob_out_path = os.path.join(prob_out_dir, image_name+"_prob.tif") else: prob_out_path = None classify_image(image_path, model_path, class_out_path, prob_out_path, apply_mask, out_type, num_chunks) def reshape_raster_for_ml(image_array): """ A low-level function that reshapes an array from gdal order [band, y, x] to scikit features order [x*y, band] For classification, scikit-learn functions take a 2-dimensional array of features of the shape (samples, features). For pixel classification, features correspond to bands and samples correspond to specific pixels. Parameters ---------- image_array A 3-dimensional Numpy array of shape (bands, y, x) Returns ------- A 2-dimensional Numpy array of shape (samples, features) """ bands, y, x = image_array.shape image_array = np.transpose(image_array, (1, 2, 0)) image_array = np.reshape(image_array, (x * y, bands)) return image_array def reshape_ml_out_to_raster(classes, width, height): """ Takes the output of a pixel classifier and reshapes to a single band image. Parameters ---------- classes A 1-d numpy array of classes from a pixel classifier width The width in pixels of the image the produced the classification height The height in pixels of the image that produced the classification Returns ------- A 2-dimensional Numpy array of shape(width, height) """ # TODO: Test this. image_array = np.reshape(classes, (height, width)) return image_array def reshape_prob_out_to_raster(probs, width, height): """ Takes the probability output of a pixel classifier and reshapes it to a raster. 


def reshape_prob_out_to_raster(probs, width, height):
    """
    Takes the probability output of a pixel classifier and reshapes it to a raster.

    Parameters
    ----------
    probs
        A numpy array of shape (n_pixels, n_classes)
    width
        The width in pixels of the image that produced the probability classification
    height
        The height in pixels of the image that produced the probability classification

    Returns
    -------
    The reshaped image array

    """
    classes = probs.shape[1]
    image_array = np.transpose(probs, (1, 0))
    image_array = np.reshape(image_array, (classes, height, width))
    return image_array


def extract_features_to_csv(in_ras_path, training_shape_path, out_path, attribute="CODE"):
    """
    Given a raster and a shapefile containing training polygons, extracts all pixels into a CSV file for further
    analysis.

    This produces a CSV file where each row corresponds to a pixel. The columns are as follows:

        Column 1: Class labels from the shapefile field labelled as 'attribute'.
        Column 2+: Band values from the raster at in_ras_path.

    Parameters
    ----------
    in_ras_path
        The path to the raster used for creating the training dataset
    training_shape_path
        The path to the shapefile containing classification polygons
    out_path
        The path for the new .csv file
    attribute
        The label of the field in the training shapefile that contains the classification labels.

    """
    this_training_data, this_classes = get_training_data(in_ras_path, training_shape_path, attribute=attribute)
    sigs = np.vstack((this_classes, this_training_data.T))
    with open(out_path, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(sigs.T)


def create_trained_model(training_image_file_paths, cross_val_repeats=5, attribute="CODE"):
    """
    Creates a trained model from a set of training images with associated shapefiles.

    This assumes that each image in training_image_file_paths has, in the same directory, a folder of the same name
    containing a shapefile of the same name. For example, in the folder training_data:

    training_data
      - area1.tif
      - area1
        - area1.shp
        - area1.dbf
        ... rest of shapefile for area 1 ...
      - area2.tif
      - area2
        - area2.shp
        - area2.dbf
        ... rest of shapefile for area 2 ...

    Parameters
    ----------
    training_image_file_paths
        A list of filepaths to training images.
    cross_val_repeats
        The number of cross-validation repeats to use
    attribute
        The label of the field in the training shapefiles that contains the classification labels.

    Returns
    -------
    model
        A fitted scikit-learn model. See notes.
    scores
        The cross-validation scores for model

    Notes
    -----
    For full details of how to create an appropriate shapefile, see [here](../index.html#training_data).
    At present, the model is an ExtraTreesClassifier arrived at by tpot:

        model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                         min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')
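
    Examples
    --------
    An illustrative call using the layout above (paths are placeholders)::

        model, scores = create_trained_model(
            ["training_data/area1.tif", "training_data/area2.tif"],
            cross_val_repeats=5,
            attribute="CODE")
        joblib.dump(model, "model.pkl")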
    """
    # This could be optimised by pre-allocating the training array, but not now.
    learning_data = None
    classes = None
    for training_image_file_path in training_image_file_paths:
        training_image_folder, training_image_name = os.path.split(training_image_file_path)
        training_image_name = training_image_name[:-4]  # Strip the file extension
        shape_path = os.path.join(training_image_folder, training_image_name, training_image_name + '.shp')
        this_training_data, this_classes = get_training_data(training_image_file_path, shape_path, attribute)
        if learning_data is None:
            learning_data = this_training_data
            classes = this_classes
        else:
            learning_data = np.append(learning_data, this_training_data, 0)
            classes = np.append(classes, this_classes)
    model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                     min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')
    model.fit(learning_data, classes)
    scores = cross_val_score(model, learning_data, classes, cv=cross_val_repeats)
    return model, scores


def create_model_for_region(path_to_region, model_out, scores_out, attribute="CODE"):
    """
    Takes all .tif files in a given folder and creates a pickled scikit-learn model for classifying them.
    Wraps classification.create_trained_model(); see the docs for that for the details.

    Parameters
    ----------
    path_to_region
        Path to the folder containing the tifs.
    model_out
        Path to location to save the .pkl file
    scores_out
        Path to save the cross-validation scores
    attribute
        The label of the field in the training shapefiles that contains the classification labels.

    """
    image_glob = os.path.join(path_to_region, r"*.tif")
    image_list = glob.glob(image_glob)
    model, scores = create_trained_model(image_list, attribute=attribute)
    joblib.dump(model, model_out)
    with open(scores_out, 'w') as score_file:
        score_file.write(str(scores))


def create_model_from_signatures(sig_csv_path, model_out, sig_datatype=np.int32):
    """
    Takes a .csv file containing class signatures - produced by extract_features_to_csv - and uses it to train and
    pickle a scikit-learn model.

    Parameters
    ----------
    sig_csv_path
        The path to the signatures file
    model_out
        The location to save the pickled model to.
    sig_datatype
        The datatype to read the csv as. Defaults to int32.

    Notes
    -----
    At present, the model is an ExtraTreesClassifier arrived at by tpot:

        model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                         min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')

    """
    model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                     min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')
    features, labels = load_signatures(sig_csv_path, sig_datatype)
    model.fit(features, labels)
    joblib.dump(model, model_out)


def load_signatures(sig_csv_path, sig_datatype=np.int32):
    """
    Extracts features and class labels from a signature CSV.

    Parameters
    ----------
    sig_csv_path
        The path to the signatures CSV
    sig_datatype
        The datatype to read the csv as. Defaults to int32.

    Returns
    -------
    features
        A numpy array of the shape (sample_count, feature_count)
    class_labels
        A 1d numpy array of class labels corresponding to the samples in features.

    """
    data = np.genfromtxt(sig_csv_path, delimiter=",", dtype=sig_datatype).T
    return (data[1:, :].T, data[0, :])
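
# Illustration of the signature CSV layout shared by extract_features_to_csv and load_signatures (the numbers are
# invented): each row is one pixel, the first column is the class label and the remaining columns are band values,
# with no header row, e.g.
#
#   1,443,523,612,2104
#   2,118,204,186,341
#
# load_signatures on such a file would return features of shape (2, 4) and labels array([1, 2]).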


def get_training_data(image_path, shape_path, attribute="CODE", shape_projection_id=4326):
    """
    Given an image and a shapefile with categories, returns training data and features suitable for fitting a
    scikit-learn classifier.

    This extracts every pixel in image_path touched by the polygons in shape_path.

    For full details of how to create an appropriate shapefile, see [here](../index.html#training_data).

    Parameters
    ----------
    image_path
        The path to the raster image to extract signatures from
    shape_path
        The path to the shapefile containing labelled class polygons
    attribute
        The field containing the class labels
    shape_projection_id
        The EPSG code of the projection of the shapefile

    Returns
    -------
    training_data
        A numpy array of shape (n_pixels, bands), where n_pixels is the number of pixels covered by the training
        polygons
    features
        A 1-d numpy array of length (n_pixels) containing the class labels for the corresponding pixel in
        training_data

    Notes
    -----
    For performance, this uses scipy's sparse.find() function to get the location of each training data pixel.
    This means that this will ignore any classes with a label of '0'.

    """
    # TODO: WRITE A TEST FOR THIS TOO; if this goes wrong, it'll go wrong
    # quietly and in a way that'll cause the most issues further on down the line
    FILL_VALUE = -9999
    with TemporaryDirectory() as td:
        # Step 1; rasterise shapefile into a .tif of class values
        shape_projection = osr.SpatialReference()
        shape_projection.ImportFromEPSG(shape_projection_id)
        image = gdal.Open(image_path)
        image_gt = image.GetGeoTransform()
        x_res, y_res = image_gt[1], image_gt[5]
        ras_path = os.path.join(td, "poly_ras")
        ras_params = gdal.RasterizeOptions(
            noData=0,
            attribute=attribute,
            xRes=x_res,
            yRes=y_res,
            outputType=gdal.GDT_Int16,
            outputSRS=shape_projection
        )
        # This produces a rasterised geotiff that's right, but not perfectly aligned to pixels.
        # This can probably be fixed.
        gdal.Rasterize(ras_path, shape_path, options=ras_params)
        rasterised_shapefile = gdal.Open(ras_path)
        shape_array = rasterised_shapefile.GetVirtualMemArray()
        local_x, local_y = get_local_top_left(image, rasterised_shapefile)
        shape_sparse = sp.coo_matrix(np.asarray(shape_array).squeeze())
        y, x, features = sp.find(shape_sparse)
        training_data = np.empty((len(features), image.RasterCount))
        image_array = image.GetVirtualMemArray()
        image_view = image_array[:,
                                 local_y: local_y + rasterised_shapefile.RasterYSize,
                                 local_x: local_x + rasterised_shapefile.RasterXSize
                                 ]
        for index in range(len(features)):
            training_data[index, :] = image_view[:, y[index], x[index]]
        image_view = None
        image_array = None
        shape_array = None
        rasterised_shapefile = None
        return training_data, features


def raster_reclass_binary(img_path, rcl_value, outFn, outFmt='GTiff', write_out=True):
    """
    Takes a raster and reclassifies rcl_value to 1, with all other values becoming 0.

    Parameters
    ----------
    img_path
        Path to 1 band input raster.
    rcl_value
        Integer indicating the value that should be reclassified to 1. All other values will be 0.
    outFn
        Output file name.
    outFmt
        Output format. Set to GTiff by default. Other GDAL options available.
    write_out
        Boolean. Set to True by default. If True, the reclassified raster is written to disk at outFn.
        If False, only an array is returned.

    Returns
    -------
    The reclassified numpy array
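
    Examples
    --------
    An illustrative call that turns class 3 into a binary mask (paths are placeholders)::

        forest_mask = raster_reclass_binary("class_map.tif", 3, "forest_mask.tif")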
    """
    log = logging.getLogger(__name__)
    log.info('Starting raster reclassification.')
    # Load in the classification raster
    in_ds = gdal.Open(img_path)
    in_band = in_ds.GetRasterBand(1)
    in_array = in_band.ReadAsArray()

    # Reclassify
    in_array[in_array != rcl_value] = 0
    in_array[in_array == rcl_value] = 1

    if write_out:
        driver = gdal.GetDriverByName(outFmt)
        out_ds = driver.Create(outFn, in_band.XSize, in_band.YSize, 1, in_band.DataType)
        out_ds.SetProjection(in_ds.GetProjection())
        out_ds.SetGeoTransform(in_ds.GetGeoTransform())
        # TODO: Check for existing files. Skip if exists or make overwrite optional.
        out_ds.GetRasterBand(1).WriteArray(in_array)

        # Write the data to disk
        out_ds.FlushCache()

        # Compute statistics on each output band;
        # setting ComputeStatistics to False calculates stats on all pixels, not estimates
        out_ds.GetRasterBand(1).ComputeStatistics(False)

        out_ds.BuildOverviews("average", [2, 4, 8, 16, 32])
        out_ds = None

    return in_array