# Python source code for the trajectory data wrapper.

from joblib import Parallel, delayed
from sklearn.utils import gen_even_slices
import numpy as np


class TrajectoryData(object):
    """Trajectory data wrapper.

    Parameters
    ----------
    attributes : array-like
        The names of attributes/features describing trajectory points in the
        dataset.
    data : array-like, shape: (n_trajectories, n_points, n_features)
        The trajectory data. If the trajectories cannot be packed into a
        regular array (e.g. they differ in length), they are stored in a
        one-dimensional object array holding one entry per trajectory.
    tids : array-like
        The corresponding trajectory IDs of trajectories in ``data``.
    labels : array-like (default=None)
        The corresponding labels of trajectories in ``data``.
    """

    def __init__(self, attributes, data, tids, labels=None):
        # All derived state (arrays, lookup tables, stats cache) is built in
        # one place so that construction and in-place merges stay consistent.
        self._update(attributes, data, tids, labels)

    def get_attributes(self):
        """Retrieves the attributes in the dataset.

        Returns
        -------
        attributes : array
            An array of length `n_features`.
        """
        return self.attributes

    def get_tids(self, label=None):
        """Retrieves the trajectory IDs in the dataset.

        Parameters
        ----------
        label : int or str (default=None)
            If `None`, then retrieves all trajectory IDs. Otherwise, returns
            the IDs corresponding to the given label.

        Returns
        -------
        tids : array
            An array of length `n_trajectories` (or the number of
            trajectories with the given label).

        Raises
        ------
        KeyError
            If `label` is given but does not occur in the dataset.
        """
        # Compare against None explicitly: falsy labels such as 0 or '' are
        # legitimate labels and must not be mistaken for "no filter".
        if label is None or self.labels is None:
            return self.tids

        return self.tids[self.labelToIdx[label]]

    def get_label(self, tid):
        """Retrieves the label for the corresponding tid.

        Parameters
        ----------
        tid : int
            The trajectory ID.

        Returns
        -------
        label : int or str
            The corresponding label.
        """
        return self.labels[self.tidToIdx[tid]]

    def get_labels(self, unique=False):
        """Retrieves the labels of the trajectories in the dataset.

        Parameters
        ----------
        unique : bool (default=False)
            If ``True``, then the set of unique labels is returned. Otherwise,
            an array with the labels of each individual trajectory is returned.

        Returns
        -------
        labels : array or list
            An array of length `n_trajectories` if `unique=False`, and a
            sorted list of length `n_labels` otherwise. `None` if the dataset
            has no labels.
        """
        if unique and self.labels is not None:
            return sorted(set(self.labels))

        return self.labels

    def get_trajectory(self, tid):
        """Retrieves a trajectory from the dataset.

        Parameters
        ----------
        tid : int
            The trajectory ID.

        Returns
        -------
        trajectory : array, shape: (n_points, n_features)
            The corresponding trajectory.
        """
        return self.data[self.tidToIdx[tid]]

    def get_trajectories(self, label=None):
        """Retrieves multiple trajectories from the dataset.

        Parameters
        ----------
        label : int or str (default=None)
            The label of the trajectories to be retrieved. If ``None``, then
            all trajectories are retrieved.

        Returns
        -------
        trajectories : array
            The trajectories of the given label. If `label=None` or if the
            dataset does not contain labels, then all trajectories are
            returned.

        Raises
        ------
        KeyError
            If `label` is given but does not occur in the dataset.
        """
        # See get_tids: only None means "no filter"; 0 and '' are labels.
        if label is None or self.labels is None:
            return self.data

        return self.data[self.labelToIdx[label]]

    def length(self):
        """Returns the number of trajectories in the dataset.

        Returns
        -------
        length : int
            Number of trajectories in the dataset.
        """
        return len(self.tids)

    def merge(self, other, ignore_duplicates=True, inplace=True):
        """Merges this trajectory data with another one. Notice that this
        method only works if the datasets have the same set of attributes.
        Only the *sets* of attribute names are compared; trajectories from
        `other` are appended as they are, without reordering their features.

        Parameters
        ----------
        other : :class:`trajminer.TrajectoryData`
            The dataset to be merged with.
        ignore_duplicates : bool (default=True)
            If `True`, then trajectory IDs in `other` that already exist in
            `self` are ignored. Otherwise, raises an exception when a duplicate
            is found.
        inplace : bool (default=True)
            If `True` modifies the current object, otherwise returns a new
            object.

        Returns
        -------
        dataset : :class:`trajminer.TrajectoryData`
            The merged dataset. If `inplace=True`, then returns the modified
            current object. The result is labeled only if `self` is labeled.

        Raises
        ------
        Exception
            If the attribute sets differ, or if a duplicate trajectory ID is
            found and `ignore_duplicates=False`.
        """
        if set(self.attributes) != set(other.attributes):
            raise Exception("Cannot merge datasets with different sets of " +
                            "attributes!")

        n_attributes = self.attributes
        n_tids = self.tids.tolist()
        n_data = self.data.tolist()
        # An unlabeled dataset stays unlabeled after the merge.
        n_labels = self.labels.tolist() if self.labels is not None else None

        # Set for O(1) duplicate checks; it also catches IDs repeated inside
        # `other`, because every accepted ID is added to it.
        seen = set(n_tids)

        for tid in other.tids.tolist():
            if tid in seen:
                if ignore_duplicates:
                    continue
                raise Exception(
                    "tid %s already exists in 'self'!" % (tid,))

            seen.add(tid)
            n_tids.append(tid)
            n_data.append(other.get_trajectory(tid))

            if n_labels is not None:
                n_labels.append(other.get_label(tid))

        if inplace:
            self._update(n_attributes, n_data, n_tids, n_labels)
            return self

        return TrajectoryData(n_attributes, n_data, n_tids, n_labels)

    def to_file(self, file, file_type='csv', n_jobs=1):
        """Persists the dataset to a file.

        Parameters
        ----------
        file : str
            The output file.
        file_type : str (default='csv')
            The file type. Must be one of `{csv}`.
        n_jobs : int (default=1)
            The number of parallel jobs.

        Raises
        ------
        ValueError
            If `file_type` is not supported.
        """
        if file_type == 'csv':
            self._to_csv(file, n_jobs)
        else:
            # Fail loudly instead of silently writing nothing.
            raise ValueError("Unsupported file type '%s'. Must be one of "
                             "{csv}." % (file_type,))

    def stats(self, print_stats=False):
        """Computes statistics for the dataset. The result is cached until
        the dataset is modified through an in-place merge.

        Parameters
        ----------
        print_stats : bool (default=False)
            If `True`, stats are printed.

        Returns
        -------
        stats : dict
            A dictionary containing the dataset statistics.
        """
        if self._stats is None:
            self._stats = self._compute_stats()

        if print_stats:
            self._print_stats()
        return self._stats

    def _compute_stats(self):
        """Builds the statistics dictionary returned by :meth:`stats`."""
        traj_lengths = np.array([len(traj) for traj in self.data])

        # Number of non-missing (not None) attribute values of every point.
        # Iterating point by point avoids concatenating the trajectories
        # first, which is not possible when points differ in size.
        attr_count = np.array([
            sum(1 for value in point if value is not None)
            for traj in self.data for point in traj])

        stats = {
            'attribute': {
                'count': len(self.attributes),
                'min': attr_count.min(),
                'avg': attr_count.mean(),
                'std': attr_count.std(),
                'max': attr_count.max()
            },
            'point': {
                'count': traj_lengths.sum()
            },
            'trajectory': {
                'count': len(self.data),
                'length': {
                    'min': traj_lengths.min(),
                    'avg': traj_lengths.mean(),
                    'std': traj_lengths.std(),
                    'max': traj_lengths.max()
                }
            }
        }

        if self.labels is not None:
            # Distribution of the number of trajectories per label.
            unique, counts = np.unique(self.labels, return_counts=True)
            stats['label'] = {
                'count': len(unique),
                'min': counts.min(),
                'avg': counts.mean(),
                'std': counts.std(),
                'max': counts.max()
            }

        return stats

    def _update(self, attributes, data, tids, labels):
        """(Re)builds the whole internal state from the given raw values and
        invalidates the cached statistics."""
        self.attributes = attributes
        self.tids = np.array(tids)
        # Keep None for unlabeled data: np.array(None) would yield a 0-d
        # object array and defeat every `self.labels is None` check.
        self.labels = np.array(labels) if labels is not None else None
        self.data = TrajectoryData._as_data_array(data)
        self._stats = None
        self.tidToIdx = {tid: idx for idx, tid in enumerate(tids)}
        self.labelToIdx = TrajectoryData._get_label_to_idx(labels)

    def _to_csv(self, file, n_jobs):
        """Writes the dataset to `file` as CSV, one line per trajectory
        point, preceded by a ``tid,label,<attributes...>`` header. An
        attribute named ``lat_lon`` is written as two columns, ``lat`` and
        ``lon``."""
        # Local import: joblib is already a dependency of this module.
        # effective_n_jobs turns joblib conventions such as n_jobs=-1 into
        # the positive number of chunks that gen_even_slices requires.
        from joblib import effective_n_jobs

        tids = self.get_tids()
        header = ['tid', 'label']
        lat_lon = -1

        for i, attr in enumerate(self.get_attributes()):
            if attr == 'lat_lon':
                header.extend(['lat', 'lon'])
                lat_lon = i
            else:
                header.append(str(attr))

        n_packs = effective_n_jobs(n_jobs)
        chunks = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(self._build_csv_lines)(tids[s], lat_lon)
            for s in gen_even_slices(len(tids), n_packs))
        lines = [line for chunk in chunks for line in chunk]

        # The file is opened only after every line was built, so a failure
        # while formatting does not leave a truncated file behind.
        with open(file, 'w') as out:
            out.write(','.join(header) + '\n')
            out.write('\n'.join(lines))

    def _build_csv_lines(self, tids, lat_lon):
        """Formats the points of the given trajectories as CSV lines.

        Parameters
        ----------
        tids : iterable
            The IDs of the trajectories to format.
        lat_lon : int
            Index of the ``lat_lon`` attribute within a point, or -1 if
            there is none.

        Returns
        -------
        lines : list of str
            One ``tid,label,<values...>`` line per trajectory point.
        """
        lines = []
        for tid in tids:
            prefix = [str(tid), str(self.get_label(tid))]

            for point in self.get_trajectory(tid):
                # Work on a copy so the stored trajectory is never modified.
                values = list(point)
                if lat_lon > -1:
                    coords = values[lat_lon]
                    values[lat_lon] = str(coords[0]) + ',' + str(coords[1])
                lines.append(','.join(prefix + [str(v) for v in values]))
        return lines

    def _print_stats(self):
        """Prints the cached statistics in a human-readable layout."""
        print('==========================================================')
        print('                           STATS                          ')
        print('==========================================================')
        print('ATTRIBUTE')
        print('  Count:           ', self._stats['attribute']['count'])
        print('  Min:             ', self._stats['attribute']['min'])
        print('  Max:             ', self._stats['attribute']['max'])
        print('  Avg ± Std:        %.4f ± %.4f' % (
            self._stats['attribute']['avg'], self._stats['attribute']['std']))

        print('\nPOINT')
        print('  Count:           ', self._stats['point']['count'])

        print('\nTRAJECTORY')
        print('  Count:           ', self._stats['trajectory']['count'])
        print('  Min length:      ',
              self._stats['trajectory']['length']['min'])
        print('  Max length:      ',
              self._stats['trajectory']['length']['max'])
        print('  Avg length ± Std: %.4f ± %.4f' %
              (self._stats['trajectory']['length']['avg'],
               self._stats['trajectory']['length']['std']))

        if self.labels is not None:
            print('\nLABEL')
            print('  Count:           ', self._stats['label']['count'])
            print('  Min:             ', self._stats['label']['min'])
            print('  Max:             ', self._stats['label']['max'])
            print('  Avg ± Std:        %.4f ± %.4f' % (
                self._stats['label']['avg'], self._stats['label']['std']))

        print('==========================================================')

    @staticmethod
    def _as_data_array(data):
        """Converts `data` to an array, falling back to a one-dimensional
        object array (one entry per trajectory) when the trajectories cannot
        be packed into a regular array."""
        try:
            return np.array(data)
        except ValueError:
            # Recent NumPy versions refuse to build ragged arrays implicitly.
            items = list(data)
            arr = np.empty(len(items), dtype=object)
            for i, item in enumerate(items):
                arr[i] = item
            return arr

    @staticmethod
    def _get_label_to_idx(labels):
        """Maps each label to the list of positions at which it occurs, or
        returns `None` if `labels` is `None`."""
        if labels is None:
            return None

        label_to_idx = {}
        for i, label in enumerate(labels):
            label_to_idx.setdefault(label, []).append(i)
        return label_to_idx