import csv
import sys
from collections import OrderedDict

import numpy as np
import pandas as pd
from toolz import itemmap, keymap, valmap

from .utils import decode_escapes, decode_array


# pandas/numpy dtype name -> ClickHouse column type name.
MAPPING = {'object': 'String',
           'uint64': 'UInt64',
           'uint32': 'UInt32',
           'uint16': 'UInt16',
           'uint8': 'UInt8',
           'float64': 'Float64',
           'float32': 'Float32',
           'int64': 'Int64',
           'int32': 'Int32',
           'int16': 'Int16',
           'int8': 'Int8',
           'datetime64[D]': 'Date',
           'datetime64[ns]': 'DateTime'}

# Same mapping keyed by numpy dtype objects, so it can be looked up
# directly with the values of ``DataFrame.dtypes``.
PD2CH = keymap(np.dtype, MAPPING)

# Reverse direction: ClickHouse type name -> pandas dtype name.
CH2PD = itemmap(reversed, MAPPING)
CH2PD['Null'] = 'object'
CH2PD['Nothing'] = 'object'

# ClickHouse types that also get a ``Nullable(...)`` entry in CH2PD.
NULLABLE_COLS = ['UInt64', 'UInt32', 'UInt16', 'UInt8', 'Float64', 'Float32',
                 'Int64', 'Int32', 'Int16', 'Int8', 'String']

for col in NULLABLE_COLS:
    CH2PD['Nullable({})'.format(col)] = CH2PD[col]

PY3 = sys.version_info[0] == 3


def normalize(df, index=True):
    """Return the ClickHouse column types of ``df`` and a normalized frame.

    Boolean columns are converted to ``uint8`` and every resulting column
    dtype is looked up in ``PD2CH``.

    Args:
        df: pandas DataFrame to normalize. It is never modified in place.
        index: when true, the index is turned into regular column(s) with
            ``reset_index()`` before the dtypes are inspected.

    Returns:
        A ``(dtypes, df)`` tuple: an ``OrderedDict`` mapping each column
        name to its ClickHouse type name, and the normalized DataFrame.

    Raises:
        ValueError: if any column dtype has no entry in ``PD2CH``.
    """
    if index:
        df = df.reset_index()

    bool_columns = list(df.select_dtypes([bool]))
    if bool_columns:
        if not index:
            # reset_index() already returned a new frame; without it we
            # would be writing the converted columns into the caller's
            # DataFrame, so work on a copy instead.
            df = df.copy()
        for col in bool_columns:
            df[col] = df[col].astype('uint8')

    dtypes = valmap(PD2CH.get, OrderedDict(df.dtypes))
    if None in dtypes.values():
        raise ValueError('Unknown type mapping in dtypes: {}'.format(dtypes))

    return dtypes, df


def to_csv(df):
    """Serialize ``df`` to CSV with no header row and no index column.

    Non-numeric fields are quoted and a backslash is used as the escape
    character.

    Returns:
        On Python 3 the CSV text encoded as UTF-8 bytes; on Python 2 the
        value returned by ``DataFrame.to_csv`` unchanged.
    """
    data = df.to_csv(header=False, index=False, encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')
    if PY3:
        return data.encode('utf-8')
    return data


def _read_header(lines):
    """Read one line from ``lines`` and split it into tab-separated fields."""
    return lines.readline().decode('utf-8').strip().split('\t')


def to_dataframe(lines, **kwargs):
    """Parse a tab-separated byte stream with two header lines into a frame.

    The first line of ``lines`` holds the column names and the second the
    ClickHouse type of each column; the remaining lines are handed to
    ``pandas.read_csv``.
    NOTE(review): this looks like ClickHouse's TabSeparatedWithNamesAndTypes
    output format -- confirm against the caller.

    Column handling by ClickHouse type:
      * ``Array(...)`` columns are converted with ``decode_array``;
      * types mapping to ``object`` (including any type missing from
        ``CH2PD``) are converted with ``decode_escapes``;
      * types mapping to a ``datetime...`` dtype are parsed as dates;
      * everything else is read with the mapped pandas dtype.

    Args:
        lines: file-like object whose ``readline()`` returns bytes.
        **kwargs: extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns:
        The parsed pandas DataFrame.
    """
    names = _read_header(lines)
    types = _read_header(lines)

    dtypes, parse_dates, converters = {}, [], {}
    for name, chtype in zip(names, types):
        dtype = CH2PD.get(chtype, 'object')
        if chtype.startswith("Array("):
            converters[name] = decode_array
        elif dtype == 'object':
            converters[name] = decode_escapes
        elif dtype.startswith('datetime'):
            parse_dates.append(name)
        else:
            dtypes[name] = dtype

    # na_values=set() together with keep_default_na=False disables pandas'
    # NA-string detection, so no field value is turned into NaN by read_csv.
    return pd.read_csv(lines, sep='\t', header=None, names=names,
                       dtype=dtypes, parse_dates=parse_dates,
                       converters=converters, na_values=set(),
                       keep_default_na=False, **kwargs)


def partition(df, chunksize=1000):
    """Yield consecutive row slices of ``df`` of at most ``chunksize`` rows.

    Nothing is yielded for an empty frame or a negative ``chunksize``.

    Args:
        df: pandas DataFrame to split.
        chunksize: maximum number of rows per yielded chunk.

    Yields:
        ``df.iloc`` slices covering the frame in order; the last one may
        be shorter than ``chunksize``.

    Raises:
        ValueError: if ``chunksize`` is zero (raised on first iteration).
    """
    nrows = df.shape[0]
    for start in range(0, nrows, chunksize):
        # iloc clips a slice that runs past the last row, so the final
        # chunk needs no explicit min().
        yield df.iloc[start:start + chunksize]