# Copyright 2017 Google Inc. and Skytruth Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import datetime import pytz import logging import numpy as np import six import time EPOCH_DT = datetime.datetime(1970, 1, 1, tzinfo=pytz.utc) def np_pad_repeat_slice(slice, window_size): """ Pads slice to the specified window size. Series that are shorter than window_size are repeated into unfilled space. Args: slice: a numpy array. window_size: the size the array must be padded to. Returns: a numpy array of length window_size in the first dimension. """ slice_length = len(slice) assert (slice_length <= window_size) reps = int(np.ceil(window_size / float(slice_length))) if reps > 1: return np.concatenate([slice] * reps, axis=0)[:window_size] else: return slice[:window_size].copy() def np_zero_pad_slice(slice, window_size, random_state): """ Pads slice to the specified window size. Series that are shorter than window_size are repeated into unfilled space. Args: slice: np.array. window_size: int Size the array must be padded to. random_state: np.RandomState Returns: a numpy array of length window_size in the first dimension. TODO: this function has an inaccurate name, really this is doing pad_repeat_slice with a random offset for data augmentation. Rename or remove at next cleanup since it doesn't appear to be used.. """ slice_length = len(slice) delta = window_size - slice_length assert delta >= 0 offset = random_state.randint(0, delta + 1) return np.concatenate([slice] * reps, axis=0)[offset:offset+window_size] def np_pad_repeat_slice_2(slice, window_size, random_state): """ Pads slice to the specified window size then rolls them. Series that are shorter than window_size are repeated into unfilled space, then the series is randomly rolled along the time axis to generate more training diversity. This has the side effect of adding a non-physical seam in the data, but in practice seems to work better than not rolling. Similar to `np_pad_repeat_slice` except for rolling the sequence along the time axis. Args: slice: a numpy array. window_size: the size the array must be padded to. random_state: a numpy RandomState object. Returns: a numpy array of length window_size in the first dimension. """ slice_length = len(slice) delta = window_size - slice_length assert delta >= 0 slice = slice.copy() GAP_LOGDT = 100 slice[0, 1] = GAP_LOGDT reps = int(np.ceil(window_size / float(slice_length))) repeated = np.concatenate([slice] * reps, axis=0) offset = random_state.randint(0, window_size) return np.roll(repeated, offset, axis=0)[:window_size] def np_array_random_fixed_length_extract(random_state, input_series, output_length): cropped = input_series[start_offset:end_offset] return np_pad_repeat_slice_2(cropped, output_length, random_state) def empty_data(window_size, series): num_features_inc_timestamp = series.shape[1] return (np.empty( [0, 1, window_size, num_features_inc_timestamp - 1], dtype=np.float32), np.empty( shape=[0, window_size], dtype=np.int32), np.empty( shape=[0, 2], dtype=np.int32), np.empty( shape=[0], dtype=str)) def cook_features(features, id_): """Convert raw features into something the model can digest Args: features: np.array of raw features id_ : the id associated with these features. Returns: (2D np.array of features for the model, 2D np.array of timestamps, (start_time, end_time) as seconds from Unix epoch. str id of vessel) """ # We use min and max here to account for possible rolling / replicating # that goes on elsewhere. start_time = int(features[:, 0].min()) end_time = int(features[:, 0].max()) # Drop the first (timestamp) column. timestamps = features[:, 0].astype(np.int32) features = features[:, 1:] if not np.isfinite(features).all(): logging.fatal('Bad features: %s', features) return (np.stack([features]), timestamps, np.array([start_time, end_time], dtype=np.int32), id_) def extract_n_random_fixed_points(random_state, input_series, n, output_length, id_, selection_ranges): """ Extracts a n, random fixed-points slice from a 2d numpy array. The input array must be 2d, representing a time series, with the first column representing a timestamp (sorted ascending). Args: random_state: a numpy randomstate object. input_series: the input series. A 2d array first column representing an ascending time. n: the number of series to extract output_length: the number of points in the output series. Input series shorter than this will be repeated into the output series. id_: the id of the vessel selection_ranges: Either a list of time ranges that should be preferentially selected from (we try to get at least on point from one of the ranges), or None if to disable this behaviour. Returns: An array of the same depth as the input, but altered width, representing the fixed points slice. """ input_length = len(input_series) if input_length < output_length: return empty_data(output_length, input_series) # Set of points where it would make sense to start a range. candidate_set = set() starts = [(x.start_time - EPOCH_DT).total_seconds() for x in selection_ranges] starts_ndxs = np.searchsorted(input_series[:, 0], starts, side='left') ends = [(x.end_time - EPOCH_DT).total_seconds() for x in selection_ranges] end_ndxs = np.searchsorted(input_series[:, 0], ends, side='right') for start_ndx, end_ndx in zip(starts_ndxs, end_ndxs): valid_start = max(0, start_ndx - output_length + 1) valid_end = min(input_length - output_length + 1, end_ndx) candidate_set.update(range(valid_start, valid_end)) candidates = list(candidate_set) if len(candidates) == 0: return empty_data(output_length, input_series) samples = [] for _ in range(n): start_index = random_state.choice(candidates) end_index = start_index + output_length samples.append(cook_features(input_series[start_index:end_index], id_)) return list(zip(*samples)) def setup_cook_features_into(n, features_shape): s0, s1 = features_shape features = np.empty((n, s0, s1 - 1), np.float32) timestamps = np.empty([n, s0], np.int32) ranges = np.empty([n, 2], np.int32) ids = np.empty([n], object) return features, timestamps, ranges, ids def cook_features_into(arrays, i, data, id_, range_=None): """Convert raw features into something the model can digest Args: arrays: (features, timestamps, ranges) i : index to inserrt into features: np.array of raw features range: tuple, optional (start, stop) timestamps, if not supplied, this is derived from features. Returns: (2D np.array of features for the model, 2D np.array of timestamps, (start_time, end_time) as seconds from Unix epoch. str id of vessel) """ features, timestamps, ranges, ids = arrays features[i] = data[:, 1:] timestamps[i] = data[:, 0] if range_ is None: ranges[i, 0] = int(data[:, 0].min()) ranges[i, 1] = int(data[:, 0].max()) else: ranges[i, :] = range_ ids[i] = id_ def extract_n_random_fixed_times(random_state, input_series, n, max_time_delta, output_length, id_, min_timeslice_size): """ Extracts a random fixed-time slice from a 2d numpy array. The input array must be 2d, representing a time series, with the first column representing a timestamp (sorted ascending). Any values in the series with a time greater than (first time + max_time_delta) are removed and the prefix series repeated into the window to pad. Args: random_state: a numpy randomstate object. input_series: the input series. A 2d array first column representing an ascending time. n: the number of series to extract max_time_delta: the maximum duration of the returned timeseries in seconds. output_length: the number of points in the output series. Input series shorter than this will be repeated into the output series. min_timeslice_size: the minimum number of points in a timeslice for the series to be considered meaningful. Returns: An array of the same depth as the input, but altered width, representing the fixed time slice. """ assert max_time_delta != 0, 'max_time_delta must be non zero for time based windows' input_length = len(input_series) if input_length < min_timeslice_size: return empty_data(output_length, input_series) min_time = input_series[0, 0] - (output_length - min_timeslice_size) max_ndx = input_length - min_timeslice_size max_time_due_to_ndx = input_series[max_ndx, 0] max_time_due_to_time = input_series[-1, 0] - max_time_delta max_time = min(max_time_due_to_ndx, max_time_due_to_time) if max_time < min_time: return empty_data(output_length, input_series) # TODO: clarify by breaking into two function arrays = setup_cook_features_into(n, (output_length, input_series.shape[-1])) for i in range(n): start_time = random_state.randint(min_time, max_time + 1) start_index = np.searchsorted(input_series[:, 0], start_time, side='left') end_index = start_index + output_length cropped = input_series[start_index:end_index] # Might only have min_timeslice_size points padded = np_pad_repeat_slice(cropped, output_length) cook_features_into(arrays, i, padded, id_) return arrays def np_array_extract_slices_for_time_ranges( random_state, input_series, id_, time_ranges, window_size, min_points_for_classification): """ Extract and process a set of specified time slices from a vessel movement feature. Args: random_state: a numpy randomstate object. input: the input data as a 2d numpy array. id_: the id of the vessel which made this series. max_time_delta: the maximum time contained in each window. window_size: the size of the window. min_points_for_classification: the minumum number of points in a window for it to be usable. Returns: A tuple comprising: 1. An numpy array comprising N feature timeslices, of dimension [n, 1, window_size, num_features]. 2. A numpy array with the timestamps for each feature point, of dimension [n, window_size]. 3. A numpy array comprising timebounds for each slice, of dimension [n, 2]. 4. A numpy array with an int64 id for each slice, of dimension [n]. """ times = input_series[:, 0] ndx = 0 arrays = setup_cook_features_into(len(time_ranges), (window_size, input_series.shape[-1])) for (start_time, end_time) in time_ranges: start_index = np.searchsorted(times, start_time, side='left') end_index = np.searchsorted(times, end_time, side='left') length = end_index - start_index # Slice out the time window. cropped = input_series[start_index:end_index] # If this window is too long, pick a random subwindow. if (length > window_size): max_offset = length - window_size start_offset = random_state.uniform(max_offset) cropped = cropped[max_offset:max_offset + window_size] if len(cropped) >= min_points_for_classification: padded = np_pad_repeat_slice(cropped, window_size) cook_features_into(arrays, ndx, padded, id_, (start_time, end_time)) ndx += 1 return tuple(x[:ndx] for x in arrays) def np_array_extract_all_fixed_slices(input_series, num_features, id_, window_size, shift): slices = [] input_length = len(input_series) for end_index in range(input_length, 0, -shift): start_index = end_index - window_size if start_index < 0: if start_index != -shift: logging.warning('input not correctly padded, dropping start') continue cropped = input_series[start_index:end_index] start_time = int(cropped[0][0]) end_time = int(cropped[-1][0]) time_bounds = np.array([start_time, end_time], dtype=np.int32) assert len(cropped) == window_size slices.append(cook_features(cropped, id_)) if slices == []: return empty_data(window_size, input_series) return list(zip(*slices)) def process_fixed_window_features(random_state, features, id_, num_features, window_size, shift, start_date, end_date, win_start, win_end): """Extract sequential sets of features with a given window size. This will return multiple sets of training features. Each set will be `window_size` in length and each successive set will be shifted by `shift`. The algorithm will attempt to place the point `win_start` within the first window at `start_data` and `win_end` within the last window at `end_data`. Note that `shift` must equal `win_end `- `win_start`. Args: random_state: a numpy randomstate object. features: the input data as a 2d numpy array. id_: the id of the vessel which made this series. num_features: the number of features in the raw features. window_size: the size of the window in points. shift: how many points the window should shift between feature sets. start_date: Attempt to place first point of first window here. end_data: Attempt to place last point of last window here. win_start: first point within in the window where we extact data. win_end: last point within the window where we extract data. """ assert win_end - win_start == shift + 1, (win_end, win_start, shift) pad_end = window_size - win_end pad_start = win_start if len(features) == 0: return empty_data(window_size, features) is_sorted = all(features[i, 0] <= features[i+1, 0] for i in six.moves.range(len(features)-1)) assert is_sorted if start_date is not None: start_stamp = time.mktime(start_date.timetuple()) if end_date is not None: end_stamp = time.mktime(end_date.timetuple()) if end_date is not None: raw_end_i = np.searchsorted(features[:, 0], end_stamp, side='right') # How much padding do we need: available = len(features) - raw_end_i n_pad_end = max(pad_end - available, 0) else: raw_end_i = len(features) n_pad_end = pad_end end_i = raw_end_i + pad_end # \/ raw_start_i # ppBBBBpp<--- shift # ppBBBBpp.........ppBBBBpp # end_i - winsize^ ^ end_i if start_date is not None: # If possible go to shift before pad so we have good data for whole length raw_start_i = np.searchsorted(features[:, 0], start_stamp, side='left') else: raw_start_i = 0 assert raw_start_i >= 0 start_i = raw_start_i - pad_start if start_i >= len(features) or end_i < 1: # No features overlap with window return empty_data(window_size, features) while (end_i - start_i - window_size) % shift: start_i -= 1 if n_pad_end > 0: features = np.concatenate([features, n_pad_end * [features[-1]]], axis=0) else: features = features[:end_i] assert len(features) == end_i if start_i < 0: features = np.concatenate([features, (-start_i) * [features[0]]], axis=0) else: features = features[start_i:] assert len(features) == end_i - start_i assert (end_i - start_i - window_size) % shift == 0 return np_array_extract_all_fixed_slices(features, num_features, id_, window_size, shift)