# -*- coding: utf-8 -*-
"""pyjstat is a python module for JSON-stat formatted data manipulation.

This module allows reading and writing JSON-stat [1]_ format with python,
using data frame structures provided by the widely accepted
pandas library [2]_. The JSON-stat format is a simple lightweight JSON format
for data dissemination. Pyjstat is inspired in rjstat [3]_, a library to read
and write JSON-stat with R, by ajschumacher.

pyjstat is written and maintained by `Miguel Expósito Martín
<https://twitter.com/predicador37>`_ and is distributed under the Apache 2.0
License (see LICENSE file).

.. [1] http://json-stat.org/ for JSON-stat information
.. [2] http://pandas.pydata.org for Python Data Analysis Library information
.. [3] https://github.com/ajschumacher/rjstat for rjstat library information

Example:
  Importing a JSON-stat file into a pandas data frame can be done as
  follows::

    import pyjstat

    dataset = pyjstat.Dataset.read(
        'http://json-stat.org/samples/oecd-canada.json')
    print(dataset.write('dataframe'))

"""

import inspect
import itertools
import json
import logging
import warnings
from collections import OrderedDict
from datetime import datetime

import numpy as np

import pandas as pd

import requests

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

# Python 2/3 compatibility: ``basestring`` only exists on Python 2.
try:
    basestring
except NameError:
    basestring = str

# Prefixes that make the read() methods treat a string as a URL.
_URL_PREFIXES = ("http://", "https://", "ftp://", "ftps://")

_READ_DEPRECATION = (
    "Shouldn't use this function anymore! Now use read() methods of "
    "Dataset, Collection or Dimension."
)
_WRITE_DEPRECATION = (
    "Shouldn't use this function anymore! Now use write() methods of "
    "Dataset, Collection or Dimension."
)


class NumpyEncoder(json.JSONEncoder):
    """Custom JSON encoder class for Numpy data types."""

    def default(self, obj):
        """Convert numpy scalars and arrays into plain Python objects.

        Args:
          obj: object the standard encoder could not serialize.

        Returns:
          int, float, bool or list equivalent of the numpy object.

        Raises:
          TypeError: raised by the base encoder for unsupported types.

        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)


def to_int(variable):
    """Convert variable to integer when possible.

    Args:
      variable (string): a string containing a real string or an integer.

    Returns:
      variable(int, string): ``int(variable)`` when the conversion succeeds,
                             otherwise the input unchanged.

    """
    try:
        return int(variable)
    except (TypeError, ValueError, OverflowError):
        return variable


def to_str(variable):
    """Convert variable to string when it can be read as an integer.

    Args:
      variable (string): a string containing a real string or an integer.

    Returns:
      variable(string): ``str(variable)`` when ``int(variable)`` succeeds,
                        otherwise the input unchanged.

    """
    try:
        int(variable)
    except (TypeError, ValueError, OverflowError):
        return variable
    return str(variable)


def check_version_2(dataset):
    """Check for json-stat version.

    Check if json-stat version attribute exists and is equal or greater
    than 2.0 for a given dataset.

    Args:
      dataset (OrderedDict): data in JSON-stat format, previously
                             deserialized to a python object by
                             json.load() or json.loads(),

    Returns:
      bool: True if version exists and is equal or greater than 2.0,
            False otherwise. For datasets without the version attribute,
            always return False.

    """
    version = dataset.get('version')
    return bool(version) and float(version) >= 2.0


def unnest_collection(collection, df_list):
    """Unnest collection extracting its datasets and converting them to df.

    Items of class 'dataset' are downloaded and appended to ``df_list`` as
    data frames; items of class 'collection' are downloaded and processed
    recursively. Items of any other class are ignored.

    Args:
      collection (OrderedDict): data in JSON-stat format, previously
                                deserialized to a python object by
                                json.load() or json.loads(),
      df_list (list): list variable which will contain the converted
                      datasets. It is modified in place.

    Returns:
      Nothing.

    """
    for item in collection['link']['item']:
        item_class = item['class']
        if item_class == 'dataset':
            df_list.append(Dataset.read(item['href']).write('dataframe'))
        elif item_class == 'collection':
            unnest_collection(request(item['href']), df_list)


def check_input(naming):
    """Check and validate input params.

    Args:
      naming (string): a string containing the naming type (label or id).

    Returns:
      Nothing

    Raises:
      ValueError: if the parameter is not in the allowed list.

    """
    if naming not in ('label', 'id'):
        raise ValueError('naming must be "label" or "id"')


def get_dimensions(js_dict, naming):
    """Get dimensions from input data.

    Args:
      js_dict (dict): dictionary containing dataset data and metadata.
      naming (string, optional): dimension naming. Possible values: 'label'
                                 or 'id'.

    Returns:
      dimensions (list): list of pandas data frames with dimension
                         category data.
      dim_names (list): list of strings with dimension names.

    """
    dimensions = []
    dim_names = []
    # JSON-stat 2.0 keeps "id" at the dataset level; older versions keep it
    # inside "dimension".
    if check_version_2(js_dict):
        dimension_dict = js_dict
    else:
        dimension_dict = js_dict['dimension']
    for dim in dimension_dict['id']:
        if naming == 'label':
            # Fall back to the dimension id when the label is missing/empty.
            dim_name = js_dict['dimension'][dim].get('label') or dim
            dimensions.append(get_dim_label(js_dict, dim))
            dim_names.append(dim_name)
        else:
            dimensions.append(get_dim_index(js_dict, dim))
            dim_names.append(dim)
    return dimensions, dim_names


def _category_index_frame(category_index):
    """Build an id/index data frame from a category index (list or dict)."""
    if isinstance(category_index, list):
        ids = list(category_index)
        positions = list(range(len(ids)))
    else:
        ids = list(category_index.keys())
        positions = list(category_index.values())
    return pd.DataFrame(list(zip(ids, positions)),
                        index=ids, columns=['id', 'index'])


def _single_category_index_frame(category_id):
    """Build the id/index data frame of a dimension with a lone category."""
    return pd.DataFrame([(category_id, 0)],
                        index=[0], columns=['id', 'index'])


def get_dim_label(js_dict, dim, dim_input="dataset"):
    """Get label from a given dimension.

    Args:
      js_dict (dict): dictionary containing dataset data and metadata (or
                      a single dimension when dim_input is 'dimension').
      dim (string): dimension name obtained from JSON file.
      dim_input (string, optional): 'dataset' when js_dict is a whole
                                    dataset, 'dimension' when it is a
                                    dimension. Defaults to 'dataset'.

    Returns:
      dim_label(pandas.DataFrame): DataFrame with label-based dimension data,
                                   sorted by category index.

    Raises:
      ValueError: if dim_input is neither 'dataset' nor 'dimension'.

    """
    if dim_input == 'dataset':
        dim_input = js_dict['dimension'][dim]
        label_col = 'label'
    elif dim_input == 'dimension':
        label_col = js_dict['label']
        dim_input = js_dict
    else:
        raise ValueError('dim_input must be "dataset" or "dimension"')

    try:
        labels = dim_input['category']['label']
    except KeyError:
        # No labels available: use the category ids as labels.
        dim_index = get_dim_index(js_dict, dim)
        dim_label = pd.concat([dim_index['id'], dim_index['id']], axis=1)
        dim_label.columns = ['id', 'label']
    else:
        label_ids = list(labels.keys())
        dim_label = pd.DataFrame(list(zip(label_ids, labels.values())),
                                 index=label_ids,
                                 columns=['id', label_col])

    # index must be added to dim label so that it can be sorted
    try:
        category_index = dim_input['category']['index']
    except KeyError:
        # Without an index, only the first category is given position 0.
        # .iloc is required: the frame may be indexed by category id.
        dim_index = _single_category_index_frame(dim_label['id'].iloc[0])
    else:
        dim_index = _category_index_frame(category_index)
    return pd.merge(dim_label, dim_index, on='id').sort_values(by='index')


def get_dim_index(js_dict, dim):
    """Get index from a given dimension.

    Args:
      js_dict (dict): dictionary containing dataset data and metadata.
      dim (string): dimension name obtained from JSON file.

    Returns:
      dim_index (pandas.DataFrame): DataFrame with index-based dimension
                                    data, sorted by category index.

    """
    try:
        category_index = js_dict['dimension'][dim]['category']['index']
    except KeyError:
        # No index: take the first category id found in the labels.
        dim_label = get_dim_label(js_dict, dim)
        dim_index = _single_category_index_frame(dim_label['id'].iloc[0])
    else:
        dim_index = _category_index_frame(category_index)
    return dim_index.sort_values(by='index')


def get_values(js_dict, value='value'):
    """Get values from input data.

    Args:
      js_dict (dict): dictionary containing dataset data and metadata.
      value (string, optional): name of the value column. Defaults to
                                'value'.

    Returns:
      values (list): list of dataset values. When the values are stored as
                     a mapping of position to value, a dense list is
                     returned with None in the positions not present.

    """
    values = js_dict[value]
    if isinstance(values, list):
        return values

    # Not a list: a mapping whose keys are positions in the dense array.
    sparse = {int(key): val for key, val in values.items()}
    size = js_dict.get('size') or js_dict['dimension']['size']
    dense = [None] * int(np.prod(np.array(size)))
    for position, val in sparse.items():
        dense[position] = val
    return dense


def get_df_row(dimensions, naming='label', i=0, record=None):
    """Generate row dimension values for a pandas dataframe.

    Rows are produced as the cartesian product of the categories of every
    dimension, with the last dimension varying fastest. Each yielded row is
    a new list.

    Args:
      dimensions (list): list of pandas dataframes with dimension labels
                         generated by get_dim_label or get_dim_index
                         methods.
      naming (string, optional): dimension naming. Possible values: 'label'
                                 or 'id'.
      i (int): position of the first dimension to expand. Default is 0.
      record (list): values already fixed for the dimensions before ``i``.
                     Ignored when ``i`` is 0. Default is empty.

    Yields:
      list: list with pandas dataframe column values except for value column

    Raises:
      ValueError: if naming is not 'label' or 'id'.

    """
    check_input(naming)
    prefix = [] if i == 0 or record is None else list(record)
    categories = [dimension[naming] for dimension in dimensions[i:]]
    for combination in itertools.product(*categories):
        row = prefix + list(combination)
        if len(row) == len(dimensions):
            yield row


def uniquify(seq):
    """Return unique values in a list in the original order.

    Args:
      seq (list): original list.

    Returns:
      list: list without duplicates preserving original order.

    """
    return list(OrderedDict.fromkeys(seq))


def generate_df(js_dict, naming, value="value"):
    """Decode JSON-stat dict into pandas.DataFrame object.

    Helper method that should be called inside from_json_stat().

    Args:
      js_dict(OrderedDict): OrderedDict with data in JSON-stat format,
                            previously deserialized into a python object by
                            json.load() or json.loads(), for example.
      naming(string): dimension naming. Possible values: 'label' or 'id.'
      value (string, optional): name of the value column. Defaults to
                                'value'.

    Returns:
      output(DataFrame): pandas.DataFrame with converted data.

    """
    dimensions, dim_names = get_dimensions(js_dict, naming)
    values = get_values(js_dict, value=value)
    rows = [category + [values[position]]
            for position, category
            in enumerate(get_df_row(dimensions, naming))]
    output = pd.DataFrame(rows, columns=dim_names + [value])
    output.index = range(0, len(values))
    return output


def from_json_stat(datasets, naming='label', value='value'):
    """Decode JSON-stat formatted data into pandas.DataFrame object.

    Deprecated: a DeprecationWarning is emitted on every call.

    Args:
      datasets(OrderedDict, list): data in JSON-stat format, previously
                                   deserialized to a python object by
                                   json.load() or json.loads(), for example.
                                   Both List and OrderedDict are accepted
                                   as inputs.
      naming(string, optional): dimension naming. Possible values: 'label'
                                or 'id'.Defaults to 'label'.
      value (string, optional): name of the value column. Defaults to
                                'value'.

    Returns:
      results(list): list of pandas.DataFrame with imported data.

    Raises:
      ValueError: if naming is not 'label' or 'id'.

    """
    warnings.warn(_READ_DEPRECATION, DeprecationWarning, stacklevel=2)

    check_input(naming)
    results = []
    if isinstance(datasets, list):
        for element in datasets:
            for dataset in element:
                results.append(generate_df(element[dataset], naming, value))
    elif isinstance(datasets, dict):
        if 'class' in datasets:
            # Only objects of class 'dataset' are converted; any other
            # class yields an empty result.
            if datasets['class'] == 'dataset':
                results.append(generate_df(datasets, naming, value))
        else:  # 1.00 bundle type
            for dataset in datasets:
                results.append(generate_df(datasets[dataset], naming, value))
    return results


def _build_categories(dims):
    """Build one JSON-stat dimension entry per column of ``dims``."""
    categories = []
    for column in dims.columns.values:
        unique_ids = uniquify(dims[column])
        categories.append({
            to_int(column): {
                "label": to_str(column),
                "category": {
                    "index": OrderedDict(
                        (to_str(category), to_int(position))
                        for position, category in enumerate(unique_ids)),
                    "label": OrderedDict(
                        (to_str(category), to_str(category))
                        for category in unique_ids),
                },
            },
        })
    return categories


def to_json_stat(input_df, value='value', output='list', version='1.3',
                 updated=None, source='Self-elaboration'):
    """Encode pandas.DataFrame object into JSON-stat format.

    The DataFrames must have exactly one value column. Deprecated: a
    DeprecationWarning is emitted on every call.

    Args:
      input_df(pandas.DataFrame): pandas data frame (or list of data frames)
                                  to encode.
      value (string, optional): name of the value column. Defaults to
                                'value'.
      output(string): accepts two values: 'list' or 'dict'. Produce list of
                      dicts or dict of dicts as output. Any other value
                      produces a serialized ``null``.
      version(string): desired json-stat version. 2.0 is preferred now.
                       Apart from this, only older 1.3 format is accepted,
                       which is the default parameter in order to preserve
                       backwards compatibility.
      updated(datetime): updated metadata in JSON-stat standard. Only
                         written for version >= 2.0. Defaults to the moment
                         of the call.
      source(string): data source in JSON-stat standard. Only written for
                      version >= 2.0.

    Returns:
      output(string): String with JSON-stat object.

    Raises:
      ValueError: if the non-value column names are not unique.

    """
    warnings.warn(_WRITE_DEPRECATION, DeprecationWarning, stacklevel=2)

    # Resolve the timestamp per call; a datetime default in the signature
    # would be frozen at import time.
    if updated is None:
        updated = datetime.today()

    if output == 'list':
        result = []
    elif output == 'dict':
        result = OrderedDict({})
    else:
        result = None

    if isinstance(input_df, pd.DataFrame):
        data = [input_df]
    else:
        data = input_df

    for row, dataframe in enumerate(data):
        # Every column except the value column is a dimension. Equality is
        # required here: a substring test would also drop columns such as
        # 'val' when the value column is named 'value'.
        dims = dataframe.filter([item for item in dataframe.columns.values
                                 if item != value])
        if len(dims.columns.values) != len(set(dims.columns.values)):
            raise ValueError('Non-value columns must constitute a unique ID')
        dim_names = list(dims)
        categories = _build_categories(dims)
        sizes = [len(dims[column].unique())
                 for column in dims.columns.values]
        observations = [None if pd.isnull(x) else x
                        for x in dataframe[value].values]

        if float(version) >= 2.0:
            dataset = {"dimension": OrderedDict(), value: observations}
            dataset["version"] = version
            dataset["class"] = "dataset"
            dataset["updated"] = updated.isoformat()
            dataset["source"] = source
            for category in categories:
                dataset["dimension"].update(category)
            dataset.update({"id": dim_names})
            dataset.update({"size": sizes})
        else:
            # Pre-2.0 layout: datasets are named and "id"/"size" live
            # inside "dimension".
            dataset_name = "dataset" + str(row + 1)
            dimension = OrderedDict()
            for category in categories:
                dimension.update(category)
            dimension.update({"id": dim_names})
            dimension.update({"size": sizes})
            dataset = {dataset_name: {"dimension": dimension,
                                      value: observations}}

        if output == 'list':
            result.append(dataset)
        elif output == 'dict':
            result.update(dataset)
        else:
            result = None
    return json.dumps(result, cls=NumpyEncoder)


def request(path, verify=True):
    """Send a request to a given URL accepting JSON format.

    Errors are logged and then re-raised unchanged.

    Args:
      path (str): The URI to be requested.
      verify: forwarded to ``requests.get`` as its ``verify`` argument.

    Returns:
      response: Deserialized JSON Python object.

    Raises:
      HTTPError: the HTTP error returned by the requested server.
      InvalidURL: an invalid URL has been requested.
      Exception: generic exception.

    """
    headers = {'Accept': 'application/json'}
    try:
        requested_object = requests.get(path, headers=headers, verify=verify)
        requested_object.raise_for_status()
    except requests.exceptions.HTTPError as exception:
        LOGGER.error((inspect.stack()[0][3]) + ': HTTPError = ' +
                     str(exception.response.status_code) + ' ' +
                     str(exception.response.reason) + ' ' + str(path))
        raise
    except requests.exceptions.InvalidURL as exception:
        # InvalidURL has no ``reason`` attribute; log the exception itself
        # so the handler cannot fail and hide the original error.
        LOGGER.error('URLError = ' + str(exception) + ' ' + str(path))
        raise
    except Exception:
        import traceback
        LOGGER.error('Generic exception: ' + traceback.format_exc())
        raise
    else:
        response = requested_object.json()
        return response


def _load_json_source(data, verify=True):
    """Turn a dict, URL, JSON string or file-like object into a mapping."""
    if isinstance(data, dict):
        return data
    if isinstance(data, basestring):
        if data.startswith(_URL_PREFIXES):
            # requests will do the rest...
            return request(data, verify=verify)
        return json.loads(data, object_pairs_hook=OrderedDict)
    return json.load(data, object_pairs_hook=OrderedDict)


class Dataset(OrderedDict):
    """A class representing a JSONstat dataset."""

    def __init__(self, *args, **kwargs):
        """Initialize object."""
        super(Dataset, self).__init__(*args, **kwargs)

    @classmethod
    def read(cls, data, verify=True, **kwargs):
        """Read data from URL, Dataframe, JSON string/file or OrderedDict.

        Args:
          data: can be a Pandas Dataframe, a JSON file, a JSON string,
                a dict/OrderedDict or a URL pointing to a JSONstat file.
          verify: whether to verify the host's SSL certificate; only used
                  when data is a URL.
          kwargs: optional arguments for to_json_stat(); only used when
                  data is a Pandas Dataframe.

        Returns:
          An object of class Dataset populated with data.

        Raises:
          ValueError: if a JSON string or file cannot be decoded.

        """
        if isinstance(data, pd.DataFrame):
            serialized = to_json_stat(data, output='dict', version='2.0',
                                      **kwargs)
            return cls(json.loads(serialized,
                                  object_pairs_hook=OrderedDict))
        return cls(_load_json_source(data, verify=verify))

    def write(self, output='jsonstat', naming="label", value='value'):
        """Write data from a Dataset object to JSONstat or Pandas Dataframe.

        Args:
          output(string): can accept 'jsonstat' or 'dataframe'. Default to
                          'jsonstat'.
          naming (string): optional, ignored if output = 'jsonstat'.
                           Dimension naming. Possible values: 'label' or
                           'id'. Defaults to 'label'.
          value (string): optional, ignored if output = 'jsonstat'. Name of
                          value column. Defaults to 'value'.

        Returns:
          Serialized JSONstat or a Pandas Dataframe,depending on the
          'output' parameter.

        Raises:
          ValueError: if output is not one of the allowed values.

        """
        if output == 'jsonstat':
            return json.dumps(OrderedDict(self), cls=NumpyEncoder)
        elif output == 'dataframe':
            return from_json_stat(self, naming=naming, value=value)[0]
        else:
            raise ValueError("Allowed arguments are 'jsonstat' or 'dataframe'")

    def get_dimension_index(self, name, value):
        """Get a dimension index from its name.

        Convert a dimension ID string and a category ID string into the
        numeric index of that category in that dimension

        Args:
           name(string): ID string of the dimension.
           value(string): ID string of the category.

        Returns:
           ndx[value](int): index of the category in the dimension; 0 when
                            the dimension has no category index.

        """
        category = self.get('dimension', {}).get(name, {}).get('category', {})
        if 'index' not in category:
            return 0
        ndx = category['index']
        if isinstance(ndx, list):
            return ndx.index(value)
        return ndx[value]

    def get_dimension_indices(self, query):
        """Get dimension indices.

        Converts a dimension/category list of dicts into a list of
        dimension indices.

        Args:
           query(list): dimension/category list of dicts. Every dimension
                        of the dataset must appear as a key in at least one
                        of the dicts; the first match is used.

        Returns:
           indices(list): list of dimensions' indices.

        """
        ids = self['id'] if self.get('id') else self['dimension']['id']
        indices = []
        for ident in ids:
            category = [d.get(ident) for d in query if ident in d][0]
            indices.append(self.get_dimension_index(ident, category))
        return indices

    def get_value_index(self, indices):
        """Convert a list of dimension indices into a numeric value index.

        Args:
            indices(list): list of dimension's indices.

        Returns:
            num(int): numeric value index.

        """
        size = self['size'] if self.get('size') else self['dimension']['size']
        ndims = len(size)
        mult = 1
        num = 0
        # Walk the dimensions from last to first; the last one has stride 1.
        for idx in range(ndims):
            if idx > 0:
                mult *= size[ndims - idx]
            num += mult * indices[ndims - idx - 1]
        return num

    def get_value_by_index(self, index):
        """Convert a numeric value index into its data value.

        Args:
            index(int): numeric value index.

        Returns:
            self['value'][index](float): Numeric data value.

        """
        return self['value'][index]

    def get_value(self, query):
        """Get data value.

        Convert a dimension/category list of dicts into a data value in
        three steps.

        Args:
           query(list): list of dicts with the desired query.

        Returns:
           value(float): numeric data value.

        """
        indices = self.get_dimension_indices(query)
        index = self.get_value_index(indices)
        return self.get_value_by_index(index)


class Dimension(OrderedDict):
    """A class representing a JSONstat dimension."""

    def __init__(self, *args, **kwargs):
        """Initialize object."""
        super(Dimension, self).__init__(*args, **kwargs)

    @classmethod
    def read(cls, data):
        """Read data from URL, Dataframe, JSON string/file or OrderedDict.

        Args:
          data: can be a Pandas Dataframe, a JSON string, a JSON file,
                a dict/OrderedDict or a URL pointing to a JSONstat file.
                A Dataframe must have an 'id' column and exactly one more
                column besides 'id' and 'index'; that column's name becomes
                the dimension label.

        Returns:
          An object of class Dimension populated with data.

        Raises:
          ValueError: if a JSON string or file cannot be decoded, or a
                      Dataframe does not have exactly one label column.

        """
        if isinstance(data, pd.DataFrame):
            [label] = [column for column in list(data.columns.values)
                       if column not in ['id', 'index']]
            output = OrderedDict({})
            output['version'] = '2.0'
            output['class'] = 'dimension'
            output['label'] = label
            output['category'] = OrderedDict({})
            output['category']['index'] = data.id.tolist()
            output['category']['label'] = OrderedDict(
                zip(data.id.values, data[label].values))
            return cls(output)
        return cls(_load_json_source(data))

    def write(self, output='jsonstat'):
        """Write data from a Dimension object to JSONstat or Dataframe.

        Args:
          output(string): can accept 'jsonstat' or 'dataframe'

        Returns:
          Serialized JSONstat or a Pandas Dataframe,depending on the
          'output' parameter.

        Raises:
          ValueError: if output is not one of the allowed values.

        """
        if output == 'jsonstat':
            return json.dumps(OrderedDict(self), cls=NumpyEncoder)
        elif output == 'dataframe':
            return get_dim_label(self, self['label'], 'dimension')
        else:
            raise ValueError("Allowed arguments are 'jsonstat' or 'dataframe'")


class Collection(OrderedDict):
    """A class representing a JSONstat collection."""

    def __init__(self, *args, **kwargs):
        """Initialize object."""
        super(Collection, self).__init__(*args, **kwargs)

    @classmethod
    def read(cls, data):
        """Read data from URL, JSON string/file or OrderedDict.

        Args:
          data: can be a URL pointing to a JSONstat file, a JSON string,
                a JSON file or a dict/OrderedDict.

        Returns:
          An object of class Collection populated with data.

        Raises:
          ValueError: if a JSON string or file cannot be decoded.

        """
        return cls(_load_json_source(data))

    def write(self, output='jsonstat'):
        """Write to JSON-stat or list of df.

        Writes data from a Collection object to JSONstat or list of Pandas
        Dataframes.

        Args:
          output(string): can accept 'jsonstat' or 'dataframe_list'

        Returns:
          Serialized JSONstat or a list of Pandas Dataframes,depending on
          the 'output' parameter.

        Raises:
          ValueError: if output is not one of the allowed values.

        """
        if output == 'jsonstat':
            return json.dumps(OrderedDict(self), cls=NumpyEncoder)
        elif output == 'dataframe_list':
            df_list = []
            unnest_collection(self, df_list)
            return df_list
        else:
            raise ValueError(
                "Allowed arguments are 'jsonstat' or 'dataframe_list'")

    def get(self, element):
        """Get element from collection.

        Get ith element from a collection in an object of the corresponding
        class, reading it from the item's 'href'.

        Note that this replaces the ``get`` method inherited from the dict
        base class, so key lookups with a default are not available on
        Collection objects.

        Args:
          element(int): position of the item inside self['link']['item'].

        Returns:
          A Dataset, Collection or Dimension object, depending on the
          'class' attribute of the item.

        Raises:
          ValueError: if the item class is not dataset, collection or
                      dimension.

        """
        item = self['link']['item'][element]
        item_class = item['class']
        if item_class == 'dataset':
            return Dataset.read(item['href'])
        elif item_class == 'collection':
            return Collection.read(item['href'])
        elif item_class == 'dimension':
            return Dimension.read(item['href'])
        else:
            raise ValueError(
                "Class not allowed. Please use dataset, collection or "
                "dimension'")