import six
import logging
import numpy as np
from pandas import DataFrame, MultiIndex, Series, DatetimeIndex, Index
import pandas as pd  # Used in global scope, do not remove.

from .._config import FAST_CHECK_DF_SERIALIZABLE
from .._util import NP_OBJECT_DTYPE
from ..exceptions import ArcticException

try:  # 0.21+ Compatibility
    from pandas._libs.tslib import Timestamp
    from pandas._libs.tslibs.timezones import get_timezone
except ImportError:
    try:  # 0.20.x Compatibility
        from pandas._libs.tslib import Timestamp, get_timezone
    except ImportError:  # <= 0.19 Compatibility
        from pandas.tslib import Timestamp, get_timezone

log = logging.getLogger(__name__)

PD_VER = pd.__version__

DTN64_DTYPE = 'datetime64[ns]'


def set_fast_check_df_serializable(config):
    global FAST_CHECK_DF_SERIALIZABLE
    FAST_CHECK_DF_SERIALIZABLE = bool(config)


def _to_primitive(arr, string_max_len=None, forced_dtype=None):
    if arr.dtype.hasobject:
        if len(arr) > 0 and isinstance(arr[0], Timestamp):
            return np.array([t.value for t in arr], dtype=DTN64_DTYPE)

        if forced_dtype is not None:
            casted_arr = arr.astype(dtype=forced_dtype, copy=False)
        elif string_max_len is not None:
            casted_arr = np.array(arr.astype('U{:d}'.format(string_max_len)))
        else:
            casted_arr = np.array(list(arr))

        # Pick up any unwanted data conversions (e.g. np.NaN to 'nan')
        if np.array_equal(arr, casted_arr):
            return casted_arr
    return arr


def _multi_index_to_records(index, empty_index):
    # array of tuples to numpy cols. copy copy copy
    if not empty_index:
        ix_vals = list(map(np.array, [index.get_level_values(i) for i in range(index.nlevels)]))
    else:
        # empty multi index has no size, create empty arrays for the recarray.
        ix_vals = [np.array([]) for n in index.names]
    index_names = list(index.names)
    count = 0
    for i, n in enumerate(index_names):
        if n is None:
            index_names[i] = 'level_%d' % count
            count += 1
            log.info("Level in MultiIndex has no name, defaulting to %s" % index_names[i])
    index_tz = [get_timezone(i.tz) if isinstance(i, DatetimeIndex) else None for i in index.levels]
    return ix_vals, index_names, index_tz
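
# Illustrative sketch (not part of the original module) of how the helpers above
# behave: _to_primitive collapses object arrays to concrete dtypes, and
# _multi_index_to_records splits a MultiIndex into per-level arrays, inventing
# names for anonymous levels. The values below are made up for demonstration:
#
#   >>> _to_primitive(np.array(['ab', 'c'], dtype=object), string_max_len=2)
#   array(['ab', 'c'], dtype='<U2')
#   >>> idx = MultiIndex.from_arrays([[1, 2], ['x', 'y']])
#   >>> _multi_index_to_records(idx, empty_index=False)[1:]
#   (['level_0', 'level_1'], [None, None])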


class PandasSerializer(object):

    def _index_to_records(self, df):
        metadata = {}
        index = df.index
        index_tz = None

        if isinstance(index, MultiIndex):
            ix_vals, index_names, index_tz = _multi_index_to_records(index, len(df) == 0)
        else:
            ix_vals = [index.values]
            index_names = list(index.names)
            if index_names[0] is None:
                index_names = ['index']
                log.info("Index has no name, defaulting to 'index'")
            if isinstance(index, DatetimeIndex) and index.tz is not None:
                index_tz = get_timezone(index.tz)

        if index_tz is not None:
            metadata['index_tz'] = index_tz
        metadata['index'] = index_names

        return index_names, ix_vals, metadata

    def _index_from_records(self, recarr):
        index = recarr.dtype.metadata['index']

        if len(index) == 1:
            rtn = Index(np.copy(recarr[str(index[0])]), name=index[0])
            if isinstance(rtn, DatetimeIndex) and 'index_tz' in recarr.dtype.metadata:
                rtn = rtn.tz_localize('UTC').tz_convert(recarr.dtype.metadata['index_tz'])
        else:
            level_arrays = []
            index_tz = recarr.dtype.metadata.get('index_tz', [])
            for level_no, index_name in enumerate(index):
                # build each index level separately to ensure we end up with the right index dtype
                level = Index(np.copy(recarr[str(index_name)]))
                if level_no < len(index_tz):
                    tz = index_tz[level_no]
                    if tz is not None:
                        if not isinstance(level, DatetimeIndex) and len(level) == 0:
                            # index type information got lost during save as the index was empty, cast back
                            level = DatetimeIndex([], tz=tz)
                        else:
                            level = level.tz_localize('UTC').tz_convert(tz)
                level_arrays.append(level)
            rtn = MultiIndex.from_arrays(level_arrays, names=index)

        return rtn

    def _to_records(self, df, string_max_len=None, forced_dtype=None):
        """
        Similar to DataFrame.to_records()
        Differences:
            Attempt type conversion for pandas columns stored as objects (e.g. strings),
            as we can only store primitives in the ndarray.
            Use dtype metadata to store column and index names.

        string_max_len: integer - enforces a string size on the dtype, if any
                        strings exist in the record
        """

        index_names, ix_vals, metadata = self._index_to_records(df)
        columns, column_vals, multi_column = self._column_data(df)

        if "" in columns:
            raise ArcticException("Cannot use empty string as a column name.")

        if multi_column is not None:
            metadata['multi_column'] = multi_column
        metadata['columns'] = columns
        names = index_names + columns

        arrays = []
        for arr, name in zip(ix_vals + column_vals, index_names + columns):
            arrays.append(_to_primitive(arr, string_max_len,
                                        forced_dtype=None if forced_dtype is None else forced_dtype[name]))

        if forced_dtype is None:
            dtype = np.dtype([(str(x), v.dtype) if len(v.shape) == 1 else (str(x), v.dtype, v.shape[1])
                              for x, v in zip(names, arrays)],
                             metadata=metadata)
        else:
            dtype = forced_dtype

        # The argument names is ignored when dtype is passed
        rtn = np.rec.fromarrays(arrays, dtype=dtype, names=names)
        # For some reason the dtype metadata is lost in the line above
        # and setting rtn.dtype to dtype does not preserve the metadata
        # see https://github.com/numpy/numpy/issues/6771
        return (rtn, dtype)

    def fast_check_serializable(self, df):
        """
        Efficiently convert the frame's object-columns/object-index/multi-index/multi-column
        to records, by creating a recarray only for the object fields instead of for the
        whole dataframe. If we have no object dtypes, we can safely convert only the first
        few rows to a recarray to test if serializable.
        Previously we'd serialize the full dataframe twice when it included object fields
        or a multi-index/columns.

        Parameters
        ----------
        df: `pandas.DataFrame` or `pandas.Series`

        Returns
        -------
        `tuple[numpy.core.records.recarray, dict[str, numpy.dtype]]`
            If any object dtypes are detected in columns or index, the dict maps
            field-name -> dtype; it is empty otherwise.
        """
        i_dtype, f_dtypes = df.index.dtype, df.dtypes
        index_has_object = df.index.dtype is NP_OBJECT_DTYPE
        fields_with_object = [f for f in df.columns if f_dtypes[f] is NP_OBJECT_DTYPE]
        if df.empty or (not index_has_object and not fields_with_object):
            arr, _ = self._to_records(df.iloc[:10])  # only first few rows for performance
            return arr, {}
        # If only the Index has Objects, choose a small slice (two columns if possible,
        # to avoid switching from a DataFrame to a Series)
        df_objects_only = df[fields_with_object if fields_with_object else df.columns[:2]]
        # Let any exceptions bubble up from here
        arr, dtype = self._to_records(df_objects_only)
        return arr, {f: dtype[f] for f in dtype.names}

    def can_convert_to_records_without_objects(self, df, symbol):
        # We can't easily distinguish string columns from objects
        try:
            # TODO: we can add here instead a check based on df size and enable fast-check if sz > threshold value
            if FAST_CHECK_DF_SERIALIZABLE:
                arr, _ = self.fast_check_serializable(df)
            else:
                arr, _ = self._to_records(df)
        except Exception as e:
            # This exception will also occur when we try to write the object so we fall-back to saving using Pickle
            log.warning('Pandas dataframe %s caused exception "%s" when attempting to convert to records. '
                        'Saving as Blob.' % (symbol, repr(e)))
            return False
        else:
            if arr.dtype.hasobject:
                log.warning('Pandas dataframe %s contains Objects, saving as Blob' % symbol)
                # Fall-back to saving using Pickle
                return False
            elif any([len(x[0].shape) for x in arr.dtype.fields.values()]):
                log.warning('Pandas dataframe %s contains >1 dimensional arrays, saving as Blob' % symbol)
                return False
            else:
                return True

    def serialize(self, item, string_max_len=None, forced_dtype=None):
        raise NotImplementedError

    def deserialize(self, item, force_bytes_to_unicode=False):
        raise NotImplementedError
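
# Illustrative sketch (not part of the original module) of the record layout
# produced by _to_records via a concrete serializer (DataFrameSerializer,
# defined below). Note that the index/column names travel in the *returned*
# dtype's metadata, since numpy drops it from the recarray itself (hence the
# numpy issue referenced above). The values below are made up:
#
#   >>> df = DataFrame({'a': [1, 2]}, index=Index([10, 20], name='ts'))
#   >>> recarr, dtype = DataFrameSerializer().serialize(df)
#   >>> recarr.dtype.names
#   ('ts', 'a')
#   >>> dtype.metadata['index'], dtype.metadata['columns']
#   (['ts'], ['a'])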


class SeriesSerializer(PandasSerializer):
    TYPE = 'series'

    def _column_data(self, s):
        if s.name is None:
            log.info("Series has no name, defaulting to 'values'")
        columns = [s.name if s.name else 'values']
        column_vals = [s.values]
        return columns, column_vals, None

    def deserialize(self, item, force_bytes_to_unicode=False):
        index = self._index_from_records(item)
        name = item.dtype.names[-1]
        data = item[name]

        if force_bytes_to_unicode:
            if six.PY2 and isinstance(name, (bytes, str)):
                name = name.decode('utf-8')

            if len(data) and isinstance(data[0], bytes):
                data = data.astype('unicode')

            if isinstance(index, MultiIndex):
                unicode_indexes = []
                # MultiIndex requires a conversion at each level.
                for level in range(len(index.levels)):
                    _index = index.get_level_values(level)
                    if isinstance(_index[0], bytes):
                        _index = _index.astype('unicode')
                    unicode_indexes.append(_index)
                index = unicode_indexes
            else:
                if len(index) and type(index[0]) == bytes:
                    index = index.astype('unicode')

        if PD_VER < '0.23.0':
            return Series.from_array(data, index=index, name=name)
        else:
            return Series(data, index=index, name=name)

    def serialize(self, item, string_max_len=None, forced_dtype=None):
        return self._to_records(item, string_max_len, forced_dtype)
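
# Illustrative sketch (not part of the original module): a Series passes through
# serialize as an (index, values) record pair. An anonymous series gets the
# 'values' column name and an anonymous index gets 'index'. Made-up values:
#
#   >>> s = Series([1.5, 2.5])
#   >>> recarr, dtype = SeriesSerializer().serialize(s)
#   >>> recarr.dtype.names
#   ('index', 'values')
#   >>> dtype.metadata['index']
#   ['index']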


class DataFrameSerializer(PandasSerializer):
    TYPE = 'df'

    def _column_data(self, df):
        columns = list(map(str, df.columns))
        if columns != list(df.columns):
            log.info("Dataframe column names converted to strings")
        column_vals = [df[c].values for c in df.columns]

        if isinstance(df.columns, MultiIndex):
            ix_vals, ix_names, _ = _multi_index_to_records(df.columns, False)
            vals = [list(val) for val in ix_vals]
            str_vals = [list(map(str, val)) for val in ix_vals]
            if vals != str_vals:
                log.info("Dataframe column names converted to strings")
            return columns, column_vals, {"names": list(ix_names), "values": str_vals}
        else:
            return columns, column_vals, None

    def deserialize(self, item, force_bytes_to_unicode=False):
        index = self._index_from_records(item)
        column_fields = [x for x in item.dtype.names if x not in item.dtype.metadata['index']]
        multi_column = item.dtype.metadata.get('multi_column')
        if len(item) == 0:
            rdata = item[column_fields] if len(column_fields) > 0 else None
            if multi_column is not None:
                columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])
                return DataFrame(rdata, index=index, columns=columns)
            else:
                return DataFrame(rdata, index=index)

        columns = item.dtype.metadata['columns']
        df = DataFrame(data=item[column_fields], index=index, columns=columns)

        if multi_column is not None:
            df.columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])

        if force_bytes_to_unicode:
            # This is needed because 'str' data written in py2 reads back in py3 as 'bytes',
            # which breaks the workflow of people migrating to py3.
            # https://github.com/manahl/arctic/issues/598
            # This should not be relied on for a normal flow; instead, write unicode strings
            # if you want to work with str in py3.

            for c in df.select_dtypes(object):
                # The conversion does not use astype (as the index does) because pandas has a
                # bug where it tries to convert the data columns to a unicode string, and the
                # object in this case would be bytes, e.g. b'abc', which is converted to
                # u"b'abc'", i.e. it includes the b character as well! This generally happens
                # when there is a str conversion without specifying the encoding,
                # e.g. str(b'abc') -> "b'abc'", and the fix is to tell it the encoding to use:
                # i.e. str(b'abc', 'utf-8') -> "abc"
                if type(df[c].iloc[0]) == bytes:
                    df[c] = df[c].str.decode('utf-8')

            if isinstance(df.index, MultiIndex):
                unicode_indexes = []
                # MultiIndex requires a conversion at each level.
                for level in range(len(df.index.levels)):
                    _index = df.index.get_level_values(level)
                    if isinstance(_index[0], bytes):
                        _index = _index.astype('unicode')
                    unicode_indexes.append(_index)
                df.index = unicode_indexes
            else:
                if type(df.index[0]) == bytes:
                    df.index = df.index.astype('unicode')

            if not df.columns.empty and type(df.columns[0]) == bytes:
                df.columns = df.columns.astype('unicode')

        return df

    def serialize(self, item, string_max_len=None, forced_dtype=None):
        return self._to_records(item, string_max_len, forced_dtype)
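
# Illustrative sketch (not part of the original module): the pre-write check used
# to decide between record storage and the pickle ("Blob") fallback. String
# object columns that cast cleanly (here 'b') are fine; truly mixed object
# columns fail the round-trip comparison in _to_primitive and fall back.
# The frame and symbol name below are made up:
#
#   >>> DataFrameSerializer().can_convert_to_records_without_objects(
#   ...     DataFrame({'a': [1, 2], 'b': ['x', 'y']}), 'my_symbol')
#   True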