python source code of export

# coding: utf8
"""
This module intends to transform a json (schema, mapping, ...) and write it in different file types.

Entry point is write_output_dict.

It relies on a class system that defines how to open and write into the different file types.

The base class is BaseOutput, it defines the mechanism.
abstract property : output_format
abstract method: write_data
Those must be overridden in the final subclasses.
The opener method should be overridden as well if open method is not enough
(to manage non ascii for example).

It is inherited by two base classes, that represent two groups of outputs:
HierarchicalOutput for nested formats (yaml and json)
ListOutput for table like formats (tsv, md, html - since this format displays a table, xlsx).
They intend to preprocess data as this is common to each group of output.
They use OutputPreProcessing class to deal with this preprocessing.
This class is a factory that will allow to use the right preprocessing methods
depending on the category treated (schema, mapping, ...).

Then those base classes are used (inherited from) to define each format:
JsonOutput, YamlOutput, TsvOutput, HtmlOutput, MdOutput, XlsxOutput
"""
import abc
import codecs
import csv
import json
import logging
import os
import re
import sys
from contextlib import contextmanager
from functools import partial
from numbers import Number

import yaml
import jinja2
from bson import json_util
from future.moves.collections import OrderedDict
from past.builtins import basestring
import pandas as pd
from openpyxl import load_workbook

logger = logging.getLogger(__name__)


class BaseOutput(object):
    """
    Abstract base class. Defines the mechanism allowing to write data into various file formats.

    The principle is to be able to define how to open the file and how to write into it
    using the same interface no matter what format is asked.

    Abstract methods to override:
    property output_format (can be class attribute): string representing the format expected
        ('json', 'yaml', 'tsv', 'html', 'md, 'xlsx', ... it should match file extension)
    write_data: define how to write into the object given by the open method

    Public methods that can be overridden to specialize the output:
    opener
    closer

    Other public method (should not be overridden):
    open: context manager that yields the file ready to be written into by write_data.
        It uses private methods opener and closer to define how to open and close the file
    """
    __metaclass__ = abc.ABCMeta

    @property
    @abc.abstractmethod
    def output_format(self):
        """Expected string matching the extension of the file format."""
        pass

    @contextmanager
    def open(self, filename):
        """Yields a file descriptor as expected in self.write_data"""
        if not filename:  # output is stdout, opener and closer must be adapted
            self.opener = self._stdout_opener
            self.closer = self._stdout_closer
        else:
            if not filename.endswith('.' + self.output_format):  # Add extension
                filename += '.' + self.output_format
        file_descr = self.opener()(filename)
        try:
            yield file_descr
        finally:
            self.closer(file_descr)

    def opener(self):
        """Return the function used to open the file (only filename will be passed as argument)."""
        return partial(open, mode='w')

    def _stdout_opener(self):
        """Mock opener if output is stdout"""
        return lambda x: sys.stdout

    def closer(self, file_descr):
        """Actions to perform to close the file."""
        try:
            file_descr.close()
        except AttributeError:
            pass

    def _stdout_closer(self, file_descr):
        """Mock closer if output is stdout"""
        pass

    @abc.abstractmethod
    def write_data(self, file_descr):
        """Expected function that writes into file_descr - object yielded by the open method."""
        pass


class OutputPreProcessing(object):
    """
    Abstract base class used to define how to preprocess the data depending on the category.

    Abstract methods to override:
    property category: string - specifies what category the child is managing (schema, mapping, ...)
    property default_columns: list - name of columns to display in table like outputs
    convert_to_dataframe: convert data into a dataframe - used for list outputs

    Public method that should be overridden:
    columns_values_makers: dict - indicate how to extract data for each columns

    Public method that can be overridden:
    filter_data: preprocess (filter) output data for hierarchical outputs

    Public methods that should not be overridden:
    regularize_column_name: reformat column name to fill columns_values_makers format
    make_column_value: use columns_values_makers to extract column data
    """
    __metaclass__ = abc.ABCMeta

    @property
    @abc.abstractmethod
    def category(self):
        """String - category to manage (schema, mapping, ...)"""
        pass

    @property
    @abc.abstractmethod
    def default_columns(self):
        """List of names of columns to display."""
        pass

    def __new__(cls, category):
        return object.__new__(rec_find_right_subclass(category, attribute='category',
                                                      start_class=cls))

    @classmethod
    def columns_values_makers(cls):
        """{column snake case name: function to extract columns data}"""
        return {}

    @classmethod
    def regularize_column_name(cls, column_name):
        """Column names are stored snake case."""
        return column_name.lower().replace(" ", "_")

    @classmethod
    def make_column_value(cls, column, data, *infos):
        """Call the right function in columns_values_makers to make column value based on infos."""
        column = cls.regularize_column_name(column)
        if column in cls.columns_values_makers():
            return cls.printable_value(cls.columns_values_makers()[column](data, *infos))
        return cls.printable_value(data.get(column))

    @classmethod
    @abc.abstractmethod
    def convert_to_dataframe(cls, data, **kwargs):
        """Create a dataframe from data"""
        return

    @classmethod
    def filter_data(cls, data):
        """Basic method just return data - can be overridden to filter (return json)"""
        return data

    @staticmethod
    def printable_value(value):
        """Helper to manage how to display objects (use json.dumps if possible)"""
        if isinstance(value, (Number, basestring, type(None))):
            return value
        try:
            return json.dumps(value, sort_keys=True)
        except (ValueError, TypeError):
            return str(value)


class _MappingPreProcessing(OutputPreProcessing):
    """Preprocess 'mapping' data from to_sql module"""
    category = 'mapping'
    default_columns = ['Field_name', 'Description', 'Type']

    @classmethod
    def columns_values_makers(cls):
        """How to extract columns data based on field mapping and field name"""
        return {
            # f for field
            'field_name': lambda f_dict, f_name: f_dict['dest'] if 'dest' in f_dict else f_name,
            'type': lambda f_dict, f_name: f_dict['type'],
        }

    @classmethod
    def convert_to_dataframe(cls, data, columns_to_get=None, **kwargs):
        """Transform data (mapping dict) into a dataframe."""
        columns_to_get = columns_to_get or cls.default_columns
        lines = []
        for db in sorted(data):
            for table in sorted(data[db]):
                lines += cls._table_dict_to_lines(db, table, data[db][table], columns_to_get)

        header = ['Database', 'Table'] + columns_to_get
        return pd.DataFrame(lines, columns=header)

    @classmethod
    def _table_dict_to_lines(cls, db_name, table_name, table_dict, columns_to_get):
        """Transform mapping table dict into a list, filtering on columns_to_get."""
        lines = []
        for field_name, field_dict in sorted(table_dict.items(), key=lambda x: x[0]):
            if field_name in ['_id', 'pk']:
                continue
            lines.append([db_name, table_name] +
                         [cls.make_column_value(col_name, field_dict, field_name)
                          for col_name in columns_to_get])
        return lines


class _DiffPreProcessing(OutputPreProcessing):
    """Preprocess 'diff' data from compare module"""
    category = 'diff'
    default_columns = ['Hierarchy', 'Previous Schema', 'New Schema']

    @classmethod
    def columns_values_makers(cls):
        """How to extract columns data based on diff dict and hierarchy"""
        return {
            'hierarchy': lambda diff, hierarchy: '.'.join(hierarchy),
            'previous_schema': lambda diff, hierarchy: diff['prev_schema']
        }

    @classmethod
    def convert_to_dataframe(cls, data, columns_to_get=None, **kwargs):
        """Transform data (list of dicts) into a dataframe."""
        columns_to_get = columns_to_get or cls.default_columns

        table = []
        for d in data:
            hierarchy = d['hierarchy'].split('.')
            db = hierarchy.pop(0)
            coll = hierarchy.pop(0) if hierarchy else ''

            table.append([db, coll] +
                         [cls.make_column_value(col_name, d, hierarchy)
                          for col_name in columns_to_get])

        header = ['Database', 'Collection'] + columns_to_get
        return pd.DataFrame(table, columns=header)


class _SchemaPreProcessing(OutputPreProcessing):
    """Prepocess mongo schema"""
    category = 'schema'
    default_columns = ['Field_full_name', 'Depth', 'Field_name', 'Type']

    @classmethod
    def columns_values_makers(cls):
        """How to extract columns data based on field schema, name and prefix."""
        return {  # 'f' for field
            'field_full_name': lambda f_schema, f, f_prefix: f_prefix + f,
            'field_compact_name': cls._field_compact_name,
            'field_name': lambda f_schema, f, f_prefix: f,
            'depth': cls._field_depth,
            'type': cls._field_type,
            'percentage': lambda f_schema, f, f_prefix: (100 * f_schema['prop_in_object']
                                                         if 'prop_in_object' in f_schema else None),
            'types_count': lambda f_schema, f, f_prefix: cls._format_types_count(
                f_schema.get('types_count', None), f_schema.get('array_types_count', None)),
        }

    @classmethod
    def filter_data(cls, data):
        """ Recursively copy schema without count fields.

        :param data: schema or subpart of schema
        :return dict (subpart of schema without count fields) or original value
        """
        if isinstance(data, dict):
            schema_filtered = dict()
            for k, v in data.items():
                if k not in ['count', 'types_count', 'prop_in_object', 'array_types_count']:
                    schema_filtered[k] = cls.filter_data(v)
            return schema_filtered
        return data

    @classmethod
    def convert_to_dataframe(cls, data, columns_to_get=None, **kwargs):
        """
        Load schema (data) into dataframe, filtering on columns_to_get (column names list).
        """
        columns_to_get = columns_to_get or cls.default_columns
        line_tuples = list()
        for database, database_schema in sorted(list(data.items())):
            for collection, collection_schema in sorted(list(database_schema.items())):
                collection_line_tuples = cls._object_schema_to_line_tuples(
                    collection_schema['object'], columns_to_get, field_prefix='')
                for line in collection_line_tuples:
                    line_tuples.append([database, collection] + list(line))

        header = tuple(['Database', 'Collection'] + columns_to_get)
        return pd.DataFrame(line_tuples, columns=header)

    @classmethod
    def _object_schema_to_line_tuples(cls, object_schema, columns_to_get, field_prefix):
        """ Get the list of tuples describing lines in object_schema

        - Sort fields by count
        - Add the tuples describing each field in object
        - Recursively add tuples for nested objects

        :param object_schema: dict
        :param columns_to_get: iterable
            columns to create for each field
        :param field_prefix: str, default ''
            allows to create full name.
            '.' is the separator for object subfields
            ':' is the separator for list of objects subfields
        :return line_tuples: list of tuples describing lines
        """
        line_tuples = []
        sorted_fields = sorted(list(object_schema.items()),
                               key=lambda x: (-x[1]['count'], x[0]) if 'count' in x[1] else x[0])

        for field, field_schema in sorted_fields:
            line_columns = cls._field_schema_to_columns(
                field, field_schema, field_prefix, columns_to_get)
            line_tuples.append(line_columns)

            types = field_schema.get('types_count', [field_schema['type']])

            if 'object' in field_schema:
                if 'ARRAY' in types:
                    current_prefix = field_prefix + field + ':'
                elif 'OBJECT' in types:
                    current_prefix = field_prefix + field + '.'
                else:
                    logger.warning('Field {} has key "object" but has types {} while should have '
                                   '"OBJECT" or "ARRAY"'.format(field, types))
                    continue
                line_tuples += cls._object_schema_to_line_tuples(
                    field_schema['object'], columns_to_get, field_prefix=current_prefix)

        return line_tuples

    @classmethod
    def _field_schema_to_columns(cls, field_name, field_schema, field_prefix, columns_to_get):
        """ Given fields information, returns a tuple representing columns_to_get.

        :param field_name:
        :param field_schema:
        :param field_prefix: str, default ''
        :param columns_to_get: iterable
            columns to create for each field
        :return field_columns: tuple
        """
        field_columns = list()
        for column in columns_to_get:
            field_columns.append(cls.make_column_value(column, field_schema, field_name, field_prefix))

        return tuple(field_columns)

    @staticmethod
    def _field_compact_name(field_schema, field_name, field_prefix):
        """ Return a compact version of field name, without parent object names.

        >>> field_compact_name(None, 'baz', 'foo.bar:')
        " .  : baz"
        """
        separators = re.sub('[^.:]', '', field_prefix)
        separators = re.sub('\.', ' . ', separators)
        separators = re.sub(':', ' : ', separators)
        return separators + field_name

    @staticmethod
    def _field_depth(field_schema, field_name, field_prefix):
        """ Return the level of imbrication of a field."""
        separators = re.sub('[^.:]', '', field_prefix)
        return len(separators)

    @staticmethod
    def _field_type(field_schema, field_name, field_prefix):
        """ Return a string describing the type of a field."""
        f_type = field_schema['type']
        if f_type == 'ARRAY':
            f_type = 'ARRAY(' + field_schema['array_type'] + ')'
        return f_type

    @staticmethod
    def _format_types_count(types_count, array_types_count=None):
        """ Format types_count to a readable sting.

        >>> format_types_count({'integer': 10, 'boolean': 5, 'null': 3, })
        'integer : 10, boolean : 5, null : 3'

        >>> format_types_count({'ARRAY': 10, 'null': 3, }, {'float': 4})
        'ARRAY(float : 4) : 10, null : 3'

        :param types_count: dict
        :param array_types_count: dict, default None
        :return types_count_string : str
        """
        if types_count is None:
            return str(None)

        types_count = sorted(types_count.items(), key=lambda x: x[1], reverse=True)

        type_count_list = list()
        for type_name, count in types_count:
            if type_name == 'ARRAY':
                array_type_name = _SchemaPreProcessing._format_types_count(array_types_count)
                type_count_list.append('ARRAY(' + array_type_name + ') : ' + str(count))
            else:
                type_count_list.append(str(type_name) + ' : ' + str(count))

        types_count_string = ', '.join(type_count_list)
        return types_count_string


class HierarchicalOutput(BaseOutput):
    """
    Abstract base class. Preprocessing for outputs that keep the json hierarchical structure.
    """

    def __init__(self, data, category='schema', without_counts=False, **kwargs):
        """
        :param data: json like structure - schema, mapping, ...
        :param without_counts: bool - default False, remove all count fields in output if True
        :param kwargs: unused - exists for a unified interface with other subclasses of BaseOutput
        """
        data_processor = OutputPreProcessing(category)
        if without_counts:
            self.data = data_processor.filter_data(data)
        else:
            self.data = data


class ListOutput(BaseOutput):
    """
    Abstract base class. Preprocessing for outputs with a table like format.

    Class attribute:
    _default_columns: allow to override PreProcessing class default_columns
                        {category: [default_columns]}
    """
    _default_columns = {}

    @classmethod
    def get_default_columns(cls):
        """List default columns by category"""
        return {
            'schema': cls._default_columns.get('schema', _SchemaPreProcessing.default_columns),
            'mapping': cls._default_columns.get('mapping', _MappingPreProcessing.default_columns),
            'diff': cls._default_columns.get('diff', _DiffPreProcessing.default_columns)}

    def __init__(self, data, category='schema', columns_to_get=None, **kwargs):
        """
        :param data: json like structure - schema, mapping, ...
        :param columns_to_get: list - column names to display in output
                                default will use default_columns class attribute
        :param kwargs: unused - exists for a unified interface with other subclasses of BaseOutput
        """
        data_processor = OutputPreProcessing(category)
        if not columns_to_get:
            columns_to_get = self.get_default_columns()[category]

        self.data_df = data_processor.convert_to_dataframe(data, columns_to_get=columns_to_get)


class JsonOutput(HierarchicalOutput):
    """
    Write data in json file.
    """
    output_format = 'json'

    def opener(self):
        """Use codecs module open function to support non ascii characters."""
        return partial(codecs.open, mode='w', encoding="utf-8")

    def write_data(self, file_descr):
        """Use json module dump function to write into file_descr (opened with opener)."""
        json.dump(self.data, file_descr, indent=4, ensure_ascii=False,
                  default=json_util.default, sort_keys=True)


class YamlOutput(HierarchicalOutput):
    """
    Write data in yaml file.
    """
    output_format = 'yaml'

    def write_data(self, file_descr):
        """Use yaml module safe_dump function to write into file_descr."""
        yaml.safe_dump(self.data, file_descr, default_flow_style=False, encoding='utf-8')


class TsvOutput(ListOutput):
    """
    Write data from self.data_df as a table in csv file.
    """
    output_format = 'tsv'

    def write_data(self, file_descr):
        """Use dataframe to_csv method to write into file_descr."""
        self.data_df.to_csv(file_descr, sep='\t', index=False, encoding="utf-8",
                            quoting=csv.QUOTE_NONE)


class HtmlOutput(ListOutput):
    """
    Write data from self.data_df as a table in html file, one table per collection.

    Uses resources/data_dict.tmpl template.
    """
    output_format = 'html'
    _default_columns = {
        'schema': ['Field_compact_name', 'Field_name', 'Count',
                   'Percentage', 'Types_count']}

    def opener(self):
        """Use codecs module open function to support non ascii characters."""
        return partial(codecs.open, mode='w', encoding="utf-8")

    def write_data(self, file_descr):
        """
        Format data from self.data_df, write into file_descr (opened with opener).
        """
        columns = self.data_df.columns
        col0 = columns[0]  # First column title (usually Database)
        col1 = columns[1]  # Second column title (usually Collection or Table)
        tmpl_variables = OrderedDict()
        for db in self.data_df[col0].unique():
            tmpl_variables[db] = OrderedDict()
            df_db = self.data_df.query('{} == @db'.format(col0)).iloc[:, 1:]
            for col in df_db[col1].unique():
                df_col = df_db.query('{} == @col'.format(col1)).iloc[:, 1:]
                tmpl_variables[db][col] = df_col.values.tolist()

        tmpl_filename = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                                     'resources', 'data_dict.tmpl')
        with open(tmpl_filename) as tmpl_fd:
            tmpl = jinja2.Template(tmpl_fd.read())

        file_descr.write(tmpl.render(col_titles=list(self.data_df)[2:],
                                     data=tmpl_variables))


class MdOutput(ListOutput):
    """
    Write data from self.data_df as a table in markdown file, one table per Collection.
    """
    output_format = 'md'
    _default_columns = {
        'schema': ['Field_compact_name', 'Field_name', 'Count',
                   'Percentage', 'Types_count']}

    def opener(self):
        """Use codecs module open function to support non ascii characters."""
        return partial(codecs.open, mode='w', encoding="utf-8")

    def write_data(self, file_descr):
        """
        Format data from self.data_df, write into file_descr (opened with opener).
        """
        columns = list(self.data_df.columns)
        col0 = columns.pop(0)       # First column title (usually Database)
        col1 = columns.pop(0)       # Second column title (usually Collection or Table)
        columns_length = []
        for col in columns:
            columns_length.append(max(self.data_df[col].map(
                lambda s: len(s) if isinstance(s, basestring) else len(str(s))).max(),
                                      len(col)) + 5)

        def format_column(col_name, value, repeat=False):
            """Closure - format columns based on existing data length."""
            col_length = columns_length[columns.index(col_name)]
            if repeat:
                return value * col_length
            return u'{{:<{}}}'.format(col_length).format(u'{}'.format(value))

        str_column_names = self._make_line([format_column(col, col) for col in columns])
        str_sep_header = self._make_line([format_column(col, '-', repeat=True) for col in columns])
        output_str = []
        for db in self.data_df[col0].unique():
            output_str.append('\n### {}: {}\n'.format(col0, db))
            df_db = self.data_df.query('{} == @db'.format(col0)).iloc[:, 1:]
            for col in df_db[col1].unique():
                if col:
                    output_str.append('#### {}: {} \n'.format(col1, col))
                df_col = df_db.query('{} == @col'.format(col1)).iloc[:, 1:]
                output_str.append("\n".join([str_column_names, str_sep_header] +
                                            [self._make_line([format_column(columns[i], value)
                                                              for i, value in enumerate(line)])
                                             for line in df_col.values.tolist()]))
                output_str.append('\n\n')

        file_descr.write("".join(output_str))

    def _make_line(self, values):
        return u'|{}|'.format('|'.join(values))


class XlsxOutput(ListOutput):
    """
    Write data from self.data_df as a table in csv file.
    """
    output_format = 'xlsx'

    def opener(self):
        """
        Just return filename.

        Write_data will manage the opening based on whether the file already exists
        """
        return lambda x: x

    def write_data(self, file_descr):
        """
        Use dataframe to_excel to write into file_descr (filename) - open first if file exists.
        """
        if os.path.isfile(file_descr):
            print(file_descr, 'exists')
            # Solution to keep existing data
            book = load_workbook(file_descr)
            writer = pd.ExcelWriter(file_descr, engine='openpyxl')
            writer.book = book
            writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
            self.data_df.to_excel(writer, sheet_name='Mongo_Schema', index=True,
                                  float_format='%.2f')
            writer.save()
        else:
            self.data_df.to_excel(file_descr, sheet_name='Mongo_Schema', index=True,
                                  float_format='%.2f')


def rec_find_right_subclass(attribute_value, attribute='output_format', start_class=BaseOutput):
    """Find which subclass of start_class should be used (has the right attribute value)"""
    for subclass in start_class.__subclasses__():
        if getattr(subclass, attribute) == attribute_value:
            return subclass
        rec_res = rec_find_right_subclass(attribute_value, start_class=subclass)
        if rec_res:
            return rec_res

    return None


def transform_data_to_file(data, formats, output=None, category='schema', **kwargs):
    """
    Transform data into each of output_formats and write result to output_filename or stdout.

    :param data: dict (schema, mapping or diff)
    :param formats: list of str - extensions of output desired among:
                            'json', 'yaml' (hierarchical formats)
                            'tsv', 'html', 'md' or 'xlsx' (list like formats)
    :param output: str full path to file where formatted output will be saved saved
                            (default is std out)
    :param category: string in 'schema', 'mapping', 'diff' - describe input data
    :param kwargs: may contain additional specific arguments
           columns: list of columns to display in the output for list like formats
           without_counts: bool to display count fields in output for hierarchical formats
    """
    wrong_formats = set(formats) - {'tsv', 'xlsx', 'json', 'yaml', 'html', 'md'}

    if wrong_formats:
        raise ValueError("Output format should be tsv, xlsx, html, md, json or yaml. "
                         "{} is/are not supported".format(wrong_formats))

    for output_format in formats:
        output_maker = rec_find_right_subclass(output_format)(
            data, category=category,
            columns_to_get=kwargs.get('columns'), without_counts=kwargs.get('without_counts'))
        with output_maker.open(output) as file_descr:
            output_maker.write_data(file_descr)