# -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import six import json import isodate import datetime import tableschema import numpy as np import pandas as pd # Starting from pandas@0.24 there is the new API # https://github.com/frictionlessdata/tableschema-pandas-py/issues/29 try: import pandas.core.dtypes.api as pdc except ImportError: import pandas.core.common as pdc # Module API class Mapper(object): # Public def convert_descriptor_and_rows(self, descriptor, rows): """Convert descriptor and rows to Pandas """ schema = tableschema.Schema(descriptor) # Get data/index data_rows = [] index_rows = [] jtstypes_map = {} for row in rows: data_values = [] index_values = [] for field, value in zip(schema.fields, row): try: if isinstance(value, float) and np.isnan(value): value = None if value and field.type == 'integer': value = int(value) value = field.cast_value(value) except tableschema.exceptions.CastError: value = json.loads(value) # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na if value is None and field.type in ('number', 'integer'): jtstypes_map[field.name] = 'number' value = np.NaN if field.name in schema.primary_key: index_values.append(value) else: data_values.append(value) if len(schema.primary_key) == 1: index_rows.append(index_values[0]) elif len(schema.primary_key) > 1: index_rows.append(tuple(index_values)) data_rows.append(tuple(data_values)) # Create index index = None if schema.primary_key: if len(schema.primary_key) == 1: index_class = pd.Index index_field = schema.get_field(schema.primary_key[0]) index_dtype = self.convert_type(index_field.type) if field.type in ['datetime', 'date']: index_class = pd.DatetimeIndex index = index_class(index_rows, name=index_field.name, dtype=index_dtype) elif len(schema.primary_key) > 1: index = pd.MultiIndex.from_tuples(index_rows, names=schema.primary_key) # Create dtypes/columns dtypes = [] columns = [] for field in schema.fields: if field.name not in schema.primary_key: field_name = field.name if six.PY2: field_name = field.name.encode('utf-8') dtype = self.convert_type(jtstypes_map.get(field.name, field.type)) dtypes.append((field_name, dtype)) columns.append(field.name) # Create dataframe array = np.array(data_rows, dtype=dtypes) dataframe = pd.DataFrame(array, index=index, columns=columns) return dataframe def convert_type(self, type): """Convert type to Pandas """ # Mapping mapping = { 'any': np.dtype('O'), 'array': np.dtype(list), 'boolean': np.dtype(bool), 'date': np.dtype('O'), 'datetime': np.dtype('datetime64[ns]'), 'duration': np.dtype('O'), 'geojson': np.dtype('O'), 'geopoint': np.dtype('O'), 'integer': np.dtype(int), 'number': np.dtype(float), 'object': np.dtype(dict), 'string': np.dtype('O'), 'time': np.dtype('O'), 'year': np.dtype(int), 'yearmonth': np.dtype('O'), } # Get type if type not in mapping: message = 'Type "%s" is not supported' % type raise tableschema.exceptions.StorageError(message) return mapping[type] def restore_descriptor(self, dataframe): """Restore descriptor from Pandas """ # Prepare fields = [] primary_key = None # Primary key if dataframe.index.name: field_type = self.restore_type(dataframe.index.dtype) field = { 'name': dataframe.index.name, 'type': field_type, 'constraints': {'required': True}, } fields.append(field) primary_key = dataframe.index.name # Fields for column, dtype in dataframe.dtypes.iteritems(): sample = dataframe[column].iloc[0] if len(dataframe) else None field_type = self.restore_type(dtype, sample=sample) field = {'name': column, 'type': field_type} # TODO: provide better required indication # if dataframe[column].isnull().sum() == 0: # field['constraints'] = {'required': True} fields.append(field) # Descriptor descriptor = {} descriptor['fields'] = fields if primary_key: descriptor['primaryKey'] = primary_key return descriptor def restore_row(self, row, schema, pk): """Restore row from Pandas """ result = [] for field in schema.fields: if schema.primary_key and schema.primary_key[0] == field.name: if field.type == 'number' and np.isnan(pk): pk = None if pk and field.type == 'integer': pk = int(pk) result.append(field.cast_value(pk)) else: value = row[field.name] if field.type == 'number' and np.isnan(value): value = None if value and field.type == 'integer': value = int(value) elif field.type == 'datetime': value = value.to_pydatetime() result.append(field.cast_value(value)) return result def restore_type(self, dtype, sample=None): """Restore type from Pandas """ # Pandas types if pdc.is_bool_dtype(dtype): return 'boolean' elif pdc.is_datetime64_any_dtype(dtype): return 'datetime' elif pdc.is_integer_dtype(dtype): return 'integer' elif pdc.is_numeric_dtype(dtype): return 'number' # Python types if sample is not None: if isinstance(sample, (list, tuple)): return 'array' elif isinstance(sample, datetime.date): return 'date' elif isinstance(sample, isodate.Duration): return 'duration' elif isinstance(sample, dict): return 'object' elif isinstance(sample, six.string_types): return 'string' elif isinstance(sample, datetime.time): return 'time' return 'string'