# Extract pixels and metadata using shapefiles and georeferenced imagery. # The purpose of this module is to generate train, test and target data # for machine learning algorithms. from __future__ import print_function from six.moves import xrange import geoio import geojson import geojson_tools as gt import numpy as np import sys import warnings from scipy.misc import imresize from itertools import cycle import osgeo.gdal as gdal from osgeo.gdalconst import * from functools import reduce def get_data(shapefile, return_labels=False, return_id=False, buffer=[0, 0], mask=False, num_chips=None): """Return pixel intensity array for each geometry in shapefile. The image reference for each geometry is found in the image_id property of the shapefile. If shapefile contains points, then buffer must have non-zero entries. The function also can also return a list of geometry ids; this is useful in case some of the shapefile entries do not produce a valid intensity array and/or class name. Args: shapefile (str): Name of shapefile in mltools geojson format. return_labels (bool): If True, then a label vector is returned. return_id (bool): if True, then the geometry id is returned. buffer (list): 2-dim buffer in PIXELS. The size of the box in each dimension is TWICE the buffer size. mask (bool): Return a masked array. num_chips (int): Maximum number of arrays to return. Returns: chips (list): List of pixel intensity numpy arrays. ids (list): List of corresponding geometry ids. labels (list): List of class names, if return_labels=True """ data, ct = [], 0 # go through point_file and unique image_id's image_ids = gt.find_unique_values(shapefile, property_name='image_id') # go through the shapefile for each image --- this is how geoio works for image_id in image_ids: # add tif extension img = geoio.GeoImage(image_id + '.tif') for chip, properties in img.iter_vector(vector=shapefile, properties=True, filter=[ {'image_id': image_id}], buffer=buffer, mask=mask): if chip is None or reduce(lambda x, y: x * y, chip.shape) == 0: continue # every geometry must have id if return_id: this_data = [chip, properties['feature_id']] else: this_data = [chip] if return_labels: try: label = properties['class_name'] if label is None: continue except (TypeError, KeyError): continue this_data.append(label) data.append(this_data) # return if max num chips is reached if num_chips: ct += 1 if ct == num_chips: return zip(*data) return zip(*data) def random_window(image, chip_size, no_chips=10000): """Implement a random chipper on a georeferenced image. Args: image (str): Image filename. chip_size (list): Array of chip dimensions. no_chips (int): Number of chips. Returns: List of chip rasters. """ img = geoio.GeoImage(image) chips = [] for i, chip in enumerate(img.iter_window_random( win_size=chip_size, no_chips=no_chips)): chips.append(chip) if i == no_chips - 1: break return chips def apply_mask(input_file, mask_file, output_file): """Apply binary mask on image. Input image and mask must have the same (x,y) dimension and the same projection. Args: input_file (str): Input file name. mask_file (str): Mask file name. output_file (str): Masked image file name. """ source_ds = gdal.Open(input_file, GA_ReadOnly) nbands = source_ds.RasterCount mask_ds = gdal.Open(mask_file, GA_ReadOnly) xsize, ysize = source_ds.RasterXSize, source_ds.RasterYSize xmasksize, ymasksize = mask_ds.RasterXSize, mask_ds.RasterYSize print('Generating mask') # Create target DS driver = gdal.GetDriverByName('GTiff') dst_ds = driver.Create(output_file, xsize, ysize, nbands, GDT_Byte) dst_ds.SetGeoTransform(source_ds.GetGeoTransform()) dst_ds.SetProjection(source_ds.GetProjection()) # Apply mask --- this is line by line at the moment, not so efficient for i in range(ysize): # read line from input image line = source_ds.ReadAsArray(xoff=0, yoff=i, xsize=xsize, ysize=1) # read line from mask mask_line = mask_ds.ReadAsArray(xoff=0, yoff=i, xsize=xsize, ysize=1) # apply mask masked_line = line * (mask_line > 0) # write for n in range(1, nbands + 1): dst_ds.GetRasterBand(n).WriteArray(masked_line[n - 1].astype(np.uint8), xoff=0, yoff=i) # close datasets source_ds, dst_ds = None, None def get_data_from_polygon_list(features, min_side_dim=0, max_side_dim=125, num_chips=None, classes=['No swimming pool', 'Swimming pool'], normalize=True, return_id=False, return_labels=True, bit_depth=8, mask=True, show_percentage=True, assert_all_valid=False, resize_dim=None, **kwargs): ''' Extract pixel intensity arrays ('chips') from image strips given a list of polygon features from a geojson file. All chips will be of uniform size. Will only return chips whose side dimension is between min_side_dim and max_side_dim. Each image strip referenced in the image_id property must be in the working directory and named as follows: <image_id>.tif. INPUTS features (list): list of polygon features from an open geojson file. IMPORTANT: Geometries must be in the same projection as the imagery! No projection checking is done! min_side_dim (int): minimum size acceptable (in pixels) for a polygon. defaults to 10. max_side_dim (int): maximum size acceptable (in pixels) for a polygon. Note that this will be the size of the height and width of all output chips. defaults to 125. num_chips (int): Maximum number of chips to return. If None, all valid chips from features will be returned. Defaults to None. classes (list['string']): name of classes for chips. Defualts to swimming pool classes (['Swimming_pool', 'No_swimming_pool']) normalize (bool): divide all chips by max pixel intensity (normalize net input). Defualts to True. return_id (bool): return the feature id with each chip. Defaults to False. return_labels (bool): Include labels in output. Labels will be numerical and correspond to the class index within the classes argument. Defualts to True. bit_depth (int): Bit depth of the imagery, necessary for proper normalization. defualts to 8 (standard for dra'd imagery). show_percentage (bool): Print percent of chips collected to stdout. Defaults to True assert_all_valid (bool): Throw an error if any of the included polygons do not match the size criteria (defined by min and max_side_dim), or are returned as None from geoio. Defaults to False. resize_dim (tup): Dimensions to reshape chips into after padding. Use for downsampling large chips. Dimensions: (n_chan, rows, cols). Defaults to None (does not resize). kwargs: ------- bands (list of ints): The band numbers (base 1) to be retrieved from the imagery. Defualts to None (all bands retrieved) buffer (int or list of two ints): Number of pixels to add as a buffer around the requested pixels. If an int, the same number of pixels will be added to both dimensions. If a list of two ints, they will be interpreted as xpad and ypad. OUTPUT chips (array): Uniformly sized chips with the following dimensions: (num_chips, num_channels, max_side_dim, max_side_dim) ids (list): Feature ids corresponding to chips. labels (array): One-hot encoded labels for chips with the follwoing dimensions: (num_chips, num_classes) ''' ct, inputs, labels, ids, nb_classes = 0, [], [], [], len(classes) total = len(features) if not num_chips else num_chips cls_dict, imgs = {classes[i]: i for i in xrange(len(classes))}, {} def write_status(ct, chip_err=False): '''helper function to write percent complete to stdout + raise AssertionError''' if show_percentage: sys.stdout.write('\r%{0:.2f}'.format(100 * (ct + 1) / float(total)) + ' ' * 20) sys.stdout.flush() if chip_err and assert_all_valid: raise AssertionError('One or more invalid polygons. Please make sure all ' \ 'polygons are valid or set assert_all_valid to False.') return ct + 1 # cycle through polygons and get pixel data for poly in features: id = poly['properties']['image_id'] coords = poly['geometry']['coordinates'][0] # open all images in geoio if id not in imgs.keys(): try: imgs[id] = geoio.GeoImage(id + '.tif') except (ValueError): raise Exception('{}.tif not found in current directory. Please make ' \ 'sure all images refereced in features are present and ' \ 'named properly'.format(str(id))) # call get_data on polygon geom chip = imgs[id].get_data_from_coords(coords, mask=mask, **kwargs) if chip is None: ct = write_status(100 * ct / float(total), ct, chip_err=True) continue # check for adequate chip size chan, h, w = np.shape(chip) pad_h, pad_w = max_side_dim - h, max_side_dim - w if min(h, w) < min_side_dim or max(h, w) > max_side_dim: ct = write_status(ct, chip_err=True) continue # zero-pad polygons to (n_bands, max_side_dim, max_side_dim) chip = chip.filled(0).astype(float) if mask else chip chip_patch = np.pad(chip, [(0, 0), (pad_h/2, (pad_h - pad_h/2)), (pad_w/2, (pad_w - pad_w/2))], 'constant', constant_values=0) # resize chip if resize_dim: new_chip = [] for band_ix in xrange(len(chip_patch)): new_chip.append(imresize(chip_patch[band_ix], resize_dim[-2:]).astype(float)) chip_patch = np.array(new_chip) # norm pixel intenisty from 0 to 1 if normalize: div = (2 ** bit_depth) - 1 chip_patch /= float(div) # get labels if return_labels: try: label = poly['properties']['class_name'] if label is None: ct = write_status(ct, chip_err=True) continue labels.append(cls_dict[label]) except (TypeError, KeyError): ct = write_status(ct, chip_err=True) continue # get feature ids if return_id: id = poly['properties']['feature_id'] ids.append(id) # append chip to inputs inputs.append(chip_patch) ct = write_status(ct) if num_chips: if len(inputs) == num_chips: break # combine data inputs = [np.array([i for i in inputs])] if return_id: inputs.append(ids) if return_labels: # format labels Y = np.zeros((len(labels), nb_classes)) for i in range(len(labels)): Y[i, labels[i]] = 1 inputs.append(Y) return inputs def get_uniform_chips(input_file, num_chips=None, **kwargs): ''' Get uniformly-sized pixel intensity arrays from image strips using a geojson file. Output will be in the same format as get_data_from_polygon_list. INPUT input_file (str): File name. This file should be filtered for polygon size num_chips (int): Maximum number of chips to return. If None will return all chips in input_file. Defaults to None kwargs: ------- See get_data_from_polygon_list docstring for other input params OUTPUT chips (array): Uniformly sized chips with the following dimensions: (num_chips, num_channels, max_side_dim, max_side_dim) ids (list): Feature ids corresponding to chips. labels (array): One-hot encoded labels for chips with the follwoing dimensions: (num_chips, num_classes) ''' # Load features from input_file with open(input_file) as f: feature_collection = geojson.load(f)['features'] if num_chips: feature_collection = feature_collection[: num_chips] return get_data_from_polygon_list(feature_collection, num_chips=num_chips, **kwargs) def uniform_chip_generator(input_file, batch_size=32, **kwargs): ''' Generate batches of uniformly-sized pixel intensity arrays from image strips using a geojson file. Output will be in the same format as get_data_from_polygon_list. INPUT input_file (str): File name batch_size (int): Number of chips to yield per iteration kwargs: ------- See get_data_from_polygon_list docstring for other input params. Do not use the num_chips arg. OUTPUT chips (array): Uniformly sized chips with the following dimensions: (num_chips, num_channels, max_side_dim, max_side_dim) ids (list): Feature ids corresponding to chips. labels (array): One-hot encoded labels for chips with the follwoing dimensions: (num_chips, num_classes) ''' # Load features from input_file with open(input_file) as f: feature_collection = geojson.load(f)['features'] # Produce batches using get_data_from_polygon_list for batch_ix in range(0, len(feature_collection), batch_size): this_batch = feature_collection[batch_ix: batch_ix + batch_size] yield get_data_from_polygon_list(this_batch, **kwargs)