# Contains functions for manipulating jsons and geojsons.

from __future__ import print_function
import geojson
import numpy as np
import geoio
import sys
import random
import subprocess
import os
from shapely.wkb import loads


def join(input_files, output_file):
    '''
    Join geojsons into one. The spatial reference system of the output file is
        the same as the one of the last file in the list.

    Args:
        input_files (list): List of file name strings.
        output_file (str): Output file name.

    Raises:
        ValueError: If input_files contains no file names.
    '''

    # Accumulate the features of every file. The last collection loaded is
    # reused as the output container, which is why its crs (and any other
    # top-level members) end up in the output file.
    final_features = []
    feat_collection = None
    for file_name in input_files:
        with open(file_name) as f:
            feat_collection = geojson.load(f)
        final_features += feat_collection['features']

    if feat_collection is None:
        raise ValueError('input_files must contain at least one file name')

    feat_collection['features'] = final_features

    # write to output file
    with open(output_file, 'w') as f:
        geojson.dump(feat_collection, f)


def split(input_file, file_1, file_2, no_in_first_file):
    '''
    Split a geojson in two separate files.

    Args:
        input_file (str): Input filename.
        file_1 (str): Output file name 1.
        file_2 (str): Output file name 2.
        no_in_first_file (int): Number of features in input_file to go to
            file_1. The remaining features go to file_2.
    '''

    # get feature collection
    with open(input_file) as f:
        feat_collection = geojson.load(f)

    features = feat_collection['features']

    # Each output is a brand new FeatureCollection built from a slice of the
    # input features; only the features are carried over.
    feat_collection_1 = geojson.FeatureCollection(features[0:no_in_first_file])
    feat_collection_2 = geojson.FeatureCollection(features[no_in_first_file:])

    with open(file_1, 'w') as f:
        geojson.dump(feat_collection_1, f)

    with open(file_2, 'w') as f:
        geojson.dump(feat_collection_2, f)


def get_from(input_file, property_names):
    '''
    Reads a geojson and returns a list of value tuples, each value
        corresponding to a property in property_names.

    Args:
        input_file (str): File name.
        property_names: List of strings; each string is a property name.

    Returns:
        List of value tuples, one tuple per feature. A property that a feature
            does not have is reported as None.
    '''

    # get feature collection
    with open(input_file) as f:
        feature_collection = geojson.load(f)

    return [tuple(feat['properties'].get(name) for name in property_names)
            for feat in feature_collection['features']]


def write_to(data, property_names, output_file):
    '''
    Write list of tuples to geojson.
        First entry of each tuple should be geometry in hex coordinates
        and the rest properties.

    Args:
        data: List of tuples.
        property_names: List of strings. Should be same length as the
            number of properties.
        output_file (str): Output file name.

    Raises:
        ValueError: If a geometry is neither a Polygon nor a Point.
    '''

    geojson_features = []
    for entry in data:
        coords_in_hex, properties = entry[0], entry[1:]
        geometry = loads(coords_in_hex, hex=True)
        property_dict = dict(zip(property_names, properties))

        if geometry.geom_type == 'Polygon':
            # only the exterior ring is written; brackets required
            coords = [list(geometry.exterior.coords)]
            geojson_geometry = geojson.Polygon(coords)
        elif geometry.geom_type == 'Point':
            coords = list(geometry.coords)[0]
            geojson_geometry = geojson.Point(coords)
        else:
            # Without this branch the previous iteration's feature would be
            # appended a second time (or a NameError raised on first entry).
            raise ValueError('Unsupported geometry type: {}'.format(
                geometry.geom_type))

        geojson_features.append(geojson.Feature(geometry=geojson_geometry,
                                                properties=property_dict))

    feature_collection = geojson.FeatureCollection(geojson_features)

    # text mode: geojson.dump writes str, which a binary file rejects on
    # Python 3
    with open(output_file, 'w') as f:
        geojson.dump(feature_collection, f)


def write_properties_to(data, property_names, input_file, output_file,
                        filter=None):
    '''
    Writes property data to polygon_file for all geometries indicated in the
        filter, and creates output file. The length of data must be equal to
        the number of geometries in the filter. Existing property values are
        overwritten.

    Args:
        data (list): List of tuples. Each entry is a tuple of dimension equal
            to property_names.
        property_names (list): Property names.
        input_file (str): Input file name.
        output_file (str): Output file name.
        filter (dict): Filter format is {'property_name':[value1,value2,...]}.
            What this achieves is to write the first entry of data to the
            properties of the feature with 'property_name'=value1, and so on.
            This makes sense only if these values are unique. If filter=None,
            then data is written to all geometries in the input file. Only the
            first key of the dict is used.
    '''

    with open(input_file) as f:
        feature_collection = geojson.load(f)

    features = feature_collection['features']

    if filter is None:
        # data[i] is written to the i-th feature of the file
        for i, feature in enumerate(features):
            for j, property_value in enumerate(data[i]):
                feature['properties'][property_names[j]] = property_value
    else:
        # next(iter(...)) instead of .keys()[0] / .values()[0]: dict views
        # are not indexable on Python 3
        filter_name, filter_values = next(iter(filter.items()))
        filter_values = np.array(filter_values)
        for feature in features:
            compare_value = feature['properties'][filter_name]
            ind = np.where(filter_values == compare_value)[0]
            if len(ind) > 0:
                # ind[0] is the position of the first matching filter value;
                # the entry of data at that position belongs to this feature
                for j, property_value in enumerate(data[ind[0]]):
                    feature['properties'][property_names[j]] = property_value

    feature_collection['features'] = features

    with open(output_file, 'w') as f:
        geojson.dump(feature_collection, f)


def find_unique_values(input_file, property_name):
    '''
    Find unique values of a given property in a geojson file.

    Args:
        input_file (str): File name.
        property_name (str): Property name.

    Returns:
        Numpy array of distinct values of property, as produced by np.unique.
            A feature that does not have the property contributes None.
    '''

    with open(input_file) as f:
        feature_collection = geojson.load(f)

    features = feature_collection['features']
    values = np.array([feat['properties'].get(property_name)
                       for feat in features])
    return np.unique(values)


def filter_by_property(input_file, output_file, property_name, values):
    '''
    Create a file containing only features with specified property value(s)
        from input_file.

    Args:
        input_file (str): File name.
        output_file (str): Output file name. '.geojson' is appended if
            missing.
        property_name (str): Name of the feature property to filter by.
        values (list): Value(s) a feature may have for property_name if it is
            to be included in output_file.
    '''

    if not output_file.endswith('.geojson'):
        output_file += '.geojson'

    # Load feature list
    with open(input_file) as f:
        feature_collection = geojson.load(f)

    # Filter feats by property_name
    feature_collection['features'] = [
        feat for feat in feature_collection['features']
        if feat['properties'][property_name] in values]

    # Save filtered file (the collection itself must be passed to dump)
    with open(output_file, 'w') as f:
        geojson.dump(feature_collection, f)


def create_train_test(input_file, output_file=None, test_size=0.2):
    '''
    Split a geojson file into train and test features. Saves features as
        geojsons in the working directory under the same file name with train
        and test prefixes to the original file name.

    Args:
        input_file (str): File name
        output_file (str): Name to use after the train_ and test_ prefixes for
            the saved files. Defaults to name of input_file.
        test_size (float or int): Amount of features to set aside as test
            data. If less than one will be interpreted as a proportion of the
            total feature collection. Otherwise it is the amount of features
            to use as test data. Defaults to 0.2.
    '''

    with open(input_file) as f:
        data = geojson.load(f)

    features = data['features']
    np.random.shuffle(features)

    # Convert test size from proportion to number of polygons. Strictly less
    # than one, so that test_size=1 means a single feature as documented.
    if test_size < 1:
        test_size = int(test_size * len(features))
    else:
        test_size = int(test_size)

    # Name output files
    if not output_file:
        output_file = input_file
    elif not output_file.endswith('.geojson'):
        output_file += '.geojson'

    test_out = 'test_{}'.format(output_file)
    train_out = 'train_{}'.format(output_file)

    # Save train and test files; the loaded collection is reused as the
    # container for both outputs
    data['features'] = features[:test_size]
    with open(test_out, 'w') as test_file:
        geojson.dump(data, test_file)

    data['features'] = features[test_size:]
    with open(train_out, 'w') as train_file:
        geojson.dump(data, train_file)


def create_balanced_geojson(input_file, classes,
                            output_file='balanced.geojson',
                            samples_per_class=None):
    '''
    Create a geojson comprised of balanced classes from input_file for
        training data. Randomly selects polygons from all classes.

    Args:
        input_file (str): File name
        classes (list[str]): Classes in input_file to include in the balanced
            output file. Must exactly match the 'class_name' property in the
            features of input_file.
        output_file (str): Name under which to save the balanced output file.
            Defaults to balanced.geojson.
        samples_per_class (int or None): Number of features to select per
            class in input_file. If None will use the smallest class size.
            Defaults to None.

    Raises:
        Exception: If a class has fewer than samples_per_class features.
    '''

    if not output_file.endswith('.geojson'):
        output_file += '.geojson'

    with open(input_file) as f:
        data = geojson.load(f)

    # Sort classes in separate lists
    sorted_classes = {clss: [] for clss in classes}

    for feat in data['features']:
        try:
            sorted_classes[feat['properties']['class_name']].append(feat)
        except KeyError:
            # feature has no class_name, or its class was not requested
            continue

    # Determine sample size per class
    if not samples_per_class:
        samples_per_class = min(len(feats)
                                for feats in sorted_classes.values())

    # Randomly select features from each class
    try:
        samps = [random.sample(feats, samples_per_class)
                 for feats in sorted_classes.values()]
    except ValueError:
        # random.sample raises ValueError when asked for more items than the
        # population holds
        raise Exception('Insufficient features in at least one class. Set ' \
                        'samples_per_class to None to use maximum amount of '\
                        'features.')

    final = [feat for sample in samps for feat in sample]

    # Shuffle and save balanced data
    np.random.shuffle(final)
    data['features'] = final

    with open(output_file, 'w') as f:
        geojson.dump(data, f)


def filter_polygon_size(input_file, output_file, min_side_dim=0,
                        max_side_dim=125, shuffle=False,
                        make_omitted_files=False):
    '''
    Create a geojson file containing only polygons with acceptable side
        dimensions.

    Args:
        input_file (str): File name
        output_file (str): Name under which to save filtered polygons.
        min_side_dim (int): Minimum acceptable side length (in pixels) for
            each polygon. Defaults to 0.
        max_side_dim (int): Maximum acceptable side length (in pixels) for
            each polygon. Defaults to 125.
        shuffle (bool): Shuffle polygons before saving to output file.
            Defaults to False.
        make_omitted_files (bool): Create files with omitted polygons. Two
            files are created: one with polygons that are too small and one
            with large polygons. Defaults to False.
    '''

    def write_status(percent_complete):
        '''helper function to write percent complete to stdout'''
        sys.stdout.write('\r%{0:.2f}'.format(percent_complete) + ' ' * 20)
        sys.stdout.flush()

    # load polygons
    with open(input_file) as f:
        data = geojson.load(f)
    total_features = float(len(data['features']))

    # format output file name
    if not output_file.endswith('.geojson'):
        output_file += '.geojson'

    # find indices of acceptable polygons
    ix_ok, small_ix, large_ix = [], [], []
    img_ids = find_unique_values(input_file, property_name='image_id')

    print('Filtering polygons... \n')
    for img_id in img_ids:
        print('... for image {} \n'.format(img_id))
        img = geoio.GeoImage(img_id + '.tif')

        # Decide per image whether a temporary vrt is needed, so the flag is
        # always defined and never carried over from a previous image.
        vrt_flag = img.shape[0] > 1

        try:
            # create vrt if img has multiple bands (more efficient)
            if vrt_flag:
                # argument list instead of a shell string: img_id comes from
                # the input file and must not be interpreted by a shell
                subprocess.call(['gdalbuildvrt', 'tmp.vrt', '-b', '1',
                                 '{}.tif'.format(img_id)])
                # the temporary vrt file is what gets filtered on
                img = geoio.GeoImage('tmp.vrt')

            # NOTE(review): the index restarts at 0 for every image, yet the
            # collected indices are later used to index data['features']
            # directly. That only lines up if input_file holds a single
            # image_id and iter_vector yields features in file order --
            # confirm before relying on multi-image inputs.
            chips = img.iter_vector(vector=input_file, properties=True,
                                    filter=[{'image_id': img_id}], mask=True)
            for ix, (chip, _properties) in enumerate(chips):
                if chip is not None:
                    chan, h, w = np.shape(chip)

                    if min(h, w) < min_side_dim:
                        # Identify small chips
                        small_ix.append(ix)
                    elif max(h, w) > max_side_dim:
                        # Identify large chips
                        large_ix.append(ix)
                    else:
                        # Identify valid chips
                        ix_ok.append(ix)

                write_status(100 * (ix + 1) / total_features)
        finally:
            # remove vrt file, even if filtering failed part way through
            if vrt_flag and os.path.exists('tmp.vrt'):
                os.remove('tmp.vrt')

    # save new geojson
    ok_polygons = [data['features'][i] for i in ix_ok]
    small_polygons = [data['features'][i] for i in small_ix]
    large_polygons = [data['features'][i] for i in large_ix]

    print(str(len(small_polygons)) + ' small polygons removed')
    print(str(len(large_polygons)) + ' large polygons removed')

    if shuffle:
        np.random.shuffle(ok_polygons)

    data['features'] = ok_polygons
    with open(output_file, 'w') as f:
        geojson.dump(data, f)

    if make_omitted_files:
        # make file with small polygons
        data['features'] = small_polygons
        with open('small_' + output_file, 'w') as f:
            geojson.dump(data, f)

        # make file with large polygons
        data['features'] = large_polygons
        with open('large_' + output_file, 'w') as f:
            geojson.dump(data, f)

    print('Saved {} polygons to {}'.format(str(len(ok_polygons)), output_file))