# python source code of core

# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.


""" Schema definition toolkit using Python classes

Usage example:

>>> class Foo(Record):
...     bin = Bytes()
...
... class MyRecord(Record):
...     a_string = Text()
...     a_float = Float()
...     record = List(SubRecord(Foo))
...
... rec = MyRecord(a_string="hej")
... rec.record = [Foo(bin="bar")]
...
... s = dumps(rec)
... print loads(s)


Internals:

A valid PySchema class contains the following class variables:

`_fields`
    An OrderedDict of `field_name` => `field_type`
    where `field_type` is an instance of a Field subclass

`_schema_name`
    The qualifying name for this schema. This is used for registering a record
    in a `SchemaStore` and for auto-identification of serialized records.
    Should be unique within a specific SchemaStore, so if auto registering is
    used it should be unique within the execution chain of the current program.

"""
from __future__ import absolute_import
from abc import ABCMeta, abstractmethod
from itertools import izip

try:
    from collections import OrderedDict
except ImportError:
    from ordereddict import OrderedDict

import warnings
import types   # absolute import, this is the python standard library types
try:
    import simplejson as json
except ImportError:
    import json


# Key under which a record's schema name travels inside its serialized JSON
# dictionary: `dumps` writes it and `load_json_dct` removes and resolves it.
SCHEMA_FIELD_NAME = "$schema"


class ParseError(Exception):
    """Generic error raised when a Record cannot be parsed from serialized data."""


class InvalidSchemaSpecification(object):
    """
    Placeholder object that raises as soon as it is used like a schema.

    The schema store puts an instance of this class in the slot of a schema
    whose definition is ambiguous. Registration happens at import time, so
    instead of failing there the error is deferred until somebody actually
    tries to read something from the stored "schema".
    """
    def __init__(self, exception_msg):
        # Stored as a regular instance attribute, so reading it back never
        # reaches __getattr__ below.
        self.exception_msg = exception_msg

    def __getattr__(self, item):
        # Only invoked for attributes that normal lookup did not find, i.e.
        # for anything a real schema would be expected to provide.
        message = self.exception_msg
        raise ValueError(message)


class SchemaStore(object):
    """Registry of record schemas and enum definitions, looked up by name.

    Record classes are stored under their full (possibly namespaced) name
    and, when that slot is still free, under their bare class name as well.
    Enum values are stored under the name of the enum definition.
    """

    def __init__(self):
        self._schema_map = {}
        self._enum_map = {}

    def __str__(self):
        # list(...) yields the same plain list of names on Python 2 and 3
        return str(list(self._schema_map))

    def add_record(self, schema, _bump_stack_level=False):
        """ Add record class to record store for retrieval at record load time.

            Can be used as a class decorator

            The class is registered under its full name. If the full name
            contains a namespace, the class is additionally registered under
            its bare class name unless that name is already present.

            :param schema:
                Record class to register
            :param _bump_stack_level:
                Adds one to the `stacklevel` of any warning emitted while
                registering
            :return:
                `schema`, unchanged
        """
        full_name = get_full_name(schema)
        has_namespace = '.' in full_name
        self._force_add(full_name, schema, _bump_stack_level, _raise_on_existing=has_namespace)
        if has_namespace and schema.__name__ not in self._schema_map:
            self._force_add(schema.__name__, schema, _bump_stack_level)
        return schema

    def add_enum(self, enum_definition):
        """Register the values of `enum_definition` under its name.

        Warns when the name is already registered with a different set of
        values. Definitions whose name is None are not stored.

        :return:
            `enum_definition`, so the method can be used as a decorator
        """
        new_values_set = set(enum_definition.values)
        old_values = self._enum_map.get(enum_definition.name)

        # The map stores `values` in whatever container the definition used,
        # so normalise to a set before comparing; otherwise a list or tuple
        # of identical values would always look "overwritten".
        if old_values is not None:
            old_values_set = set(old_values)
            if new_values_set != old_values_set:
                warnings.warn(
                    "Enum {!r} overwritten! Was: {}, Overwritten by: {}".format(
                        enum_definition.name,
                        old_values_set,
                        new_values_set
                    )
                )

        if enum_definition.name is not None:
            self._enum_map[enum_definition.name] = enum_definition.values
        # return the definition to allow the method to be used as a decorator
        return enum_definition

    def _force_add(self, used_name, schema, _bump_stack_level=False, _raise_on_existing=False):
        """Store `schema` under `used_name`, replacing any previous entry.

        A warning is emitted when a different schema was already stored under
        that name. With `_raise_on_existing` set, the slot is filled with an
        `InvalidSchemaSpecification` placeholder instead (or keeps the
        placeholder that is already there), so the conflict surfaces when the
        entry is used rather than at registration time.
        """
        existing = self._schema_map.get(used_name, None)
        if existing and existing != schema:
            full_name = get_full_name(schema)
            # Mention the real name when registering under the bare class name
            explanation = "(actually {0})".format(full_name) if full_name != used_name else ""

            warnings.warn(
                "{used_name}{explanation}: old definition in {prev_module} replaced by definition in {new_module}"
                .format(used_name=used_name,
                        explanation=explanation,
                        prev_module=existing.__module__,
                        new_module=schema.__module__),
                stacklevel=4 if _bump_stack_level else 3)

            if _raise_on_existing:
                if not isinstance(existing, InvalidSchemaSpecification):
                    schema = InvalidSchemaSpecification(
                        'Attempted to access data from a dubious schema specification. '
                        'The schema for: {used_name} was provided by both {existing} and {new}'
                        .format(used_name=used_name, existing=existing, new=schema))
                else:
                    schema = existing

        self._schema_map[used_name] = schema

    def get(self, record_name):
        """
        Will return a matching record or raise KeyError if no record is found.

        If the record name is a full name we will first check for a record matching the full name.
        If no such record is found any record matching the last part of the full name (without the namespace) will
        be returned.
        """
        if record_name in self._schema_map:
            return self._schema_map[record_name]
        else:
            last_name = record_name.split('.')[-1]
            return self._schema_map[last_name]

    def get_enum(self, name):
        """Return the values stored for enum `name`; raises KeyError if unknown."""
        return self._enum_map[name]

    def clear(self):
        """Remove every registered schema and enum."""
        self._schema_map.clear()
        self._enum_map.clear()

    def clone(self):
        """Return a new store holding shallow copies of both name maps."""
        r = SchemaStore()
        r._schema_map = self._schema_map.copy()
        r._enum_map = self._enum_map.copy()

        return r

    def has_schema(self, name):
        """Tell whether `get(name)` would find a schema.

        Uses the same fallback as `get`: a namespaced name also matches an
        entry stored under its last dotted component.
        """
        if name in self._schema_map:
            return True
        if "." in name:
            basename = name.split(".")[-1]
            return basename in self._schema_map
        return False

    def has_enum(self, name):
        """Tell whether an enum is registered under `name`."""
        return name in self._enum_map

    def __contains__(self, schema):
        # Membership is by stored schema object, not by name
        return schema in self._schema_map.values()


class RecordStore(SchemaStore):
    """Deprecated name for `SchemaStore`; creating one emits a DeprecationWarning."""

    def __init__(self):
        message = "RecordStore is deprecated and has been renamed to SchemaStore"
        # stacklevel=2 attributes the warning to the code instantiating the store
        warnings.warn(message, category=DeprecationWarning, stacklevel=2)
        super(RecordStore, self).__init__()


def get_full_name(schema):
    """Return the qualified name of `schema`.

    The name is `<namespace>.<class name>` when the class defines
    `_namespace`, or the deprecated `_avro_namespace_` (which triggers a
    DeprecationWarning). Without either attribute it is the bare class name.
    `_namespace` wins when both are present.
    """
    short_name = schema.__name__

    if hasattr(schema, '_namespace'):
        return '.'.join((schema._namespace, short_name))

    if not hasattr(schema, '_avro_namespace_'):
        return short_name

    warnings.warn("_avro_namespace is deprecated, use _namespace instead", DeprecationWarning, stacklevel=3)
    return '.'.join((schema._avro_namespace_, short_name))


class _NoDefault:
    """Type of the `NO_DEFAULT` sentinel; it only provides a readable repr."""

    def __repr__(self):
        return "NO_DEFAULT"

# NO_DEFAULT is a special value to signify that a field has no default value
# and should fail to serialize unless a value has been assigned
# it's the default default-value for all non-nullable fields

NO_DEFAULT = _NoDefault()

# Marks "no `default` argument was passed" in Field.__init__, so that an
# explicit `default=None` can be told apart from an omitted argument.
_UNTOUCHED = object()


class Field(object):
    """Abstract base class for the field types used in record definitions.

    Subclasses implement `dump` and `load`. Every instance receives a
    monotonically increasing `_index` on creation, which records the order in
    which fields were declared.
    """
    # Python 2 metaclass hook (this attribute has no effect on Python 3)
    __metaclass__ = ABCMeta
    _next_index = 0

    def __init__(self, description=None, nullable=True, default=_UNTOUCHED):
        """
        :param description:
            Optional free-text description of the field
        :param nullable:
            Whether the field is nullable; also selects the implicit default
        :param default:
            Default value of the field. When omitted it is None for nullable
            fields and `NO_DEFAULT` for all others
        """
        self.description = description
        self._index = Field._next_index
        self.nullable = nullable
        if default is _UNTOUCHED:
            # if default isn't explicitly set
            # use None for Nullables, and NO_DEFAULT for others
            if nullable:
                default = None
            else:
                default = NO_DEFAULT

        self.default = default
        Field._next_index += 1  # used for arg order in initialization

    def repr_vars(self):
        """Return an ordered dictionary describing the field definition

        Should contain all fields that are required for the definition of this field in a pyschema class.
        Values are already repr()-formatted strings."""
        d = OrderedDict()
        d["nullable"] = repr(self.nullable)
        d["default"] = repr(self.default)
        if self.description is not None:
            d["description"] = repr(self.description)
        return d

    def __repr__(self):
        # items() rather than iteritems(): works on both Python 2 and 3
        strings = ('{0}={1}'.format(vname, val) for vname, val in self.repr_vars().items())
        return self.__class__.__name__ + '(' + ', '.join(strings) + ')'

    def set_parent(self, schema):
        # no-op by default but can be overridden by types
        # that need parent references
        pass

    @abstractmethod
    def dump(self, obj):
        """Convert a field value into its serializable form (subclass hook)."""
        pass

    @abstractmethod
    def load(self, obj):
        """Convert a serialized value back into a field value (subclass hook)."""
        pass

    @classmethod
    def mixin(cls, mixin_cls):
        """Decorator for mixing in additional functionality into field type

        Every attribute of `mixin_cls` whose name does not start with a
        double underscore is copied onto the field class. Returns `mixin_cls`.

        Example:

        >>> @Integer.mixin
        ... class IntegerPostgresExtensions:
        ...     postgres_type = 'INT'
        ...
        ...     def postgres_dump(self, obj):
        ...         return self.dump(obj) + "::integer"

        Is roughly equivalent to:

        >>> Integer.postgres_type = 'INT'
        ...
        ... def postgres_dump(self, obj):
        ...     return self.dump(obj) + "::integer"
        ...
        ... Integer.postgres_dump = postgres_dump

        """
        for item_name in dir(mixin_cls):
            if item_name.startswith("__"):
                # don't copy magic properties
                continue
            item = getattr(mixin_cls, item_name)

            if isinstance(item, types.MethodType):
                # a method object tied to mixin_cls will cause problems
                # so get the underlying function instead
                # (`__func__` is the portable spelling of `im_func`)
                item = item.__func__

            setattr(cls, item_name, item)
        return mixin_cls

    def default_value(self):
        """Return the value given to this field when none is supplied."""
        return self.default

    def is_similar_to(self, other):
        """Tell whether `other` has the exact same type, default, nullability and description."""
        return (
            type(self) == type(other) and
            self.default == other.default and
            self.nullable == other.nullable and
            self.description == other.description
        )

# Module-level default store. PySchema adds every newly created record class
# to it while auto registration is enabled, and load_json_dct falls back to it
# when no record_store is passed.
auto_store = SchemaStore()


class PySchema(ABCMeta):
    """Metaclass for Records

    Builds schema on Record declaration and remembers Record types
    for easy generic parsing

    Every class created through this metaclass gets a `_fields` ordered
    mapping and a `_schema_name`, and is added to `auto_store` while
    `auto_register` is true.
    """
    auto_register = True

    def __new__(metacls, name, bases, dct):
        schema_attrs = metacls._get_schema_attributes(
            name=name,
            bases=bases,
            dct=dct
        )
        dct.update(schema_attrs)
        cls = ABCMeta.__new__(metacls, name, bases, dct)

        # allow self-references etc.
        for field in cls._fields.values():
            field.set_parent(cls)

        if metacls.auto_register:
            auto_store.add_record(cls, _bump_stack_level=True)
        return cls

    @classmethod
    def _field_dupe_warning(metacls, name, fields):
        """Warn that `fields` are defined more than once for schema `name`."""
        warnings.warn(
            "{schema}: Duplicate field definition for field{plural} {field}"
                .format(
                    schema=name,
                    field=fields,
                    plural="s" if len(fields) > 1 else ""
                ),
            # NOTE(review): presumably chosen so the warning points at the
            # class statement that triggered it -- confirm
            stacklevel=4
        )

    @classmethod
    def _get_schema_attributes(metacls, name, bases, dct):
        """Compute the `_fields` and `_schema_name` attributes of a new class.

        Fields inherited from PySchema bases come first, in base order.
        Fields declared in `dct` follow, ordered by their creation index.
        A later definition of a field name replaces the earlier one and
        triggers a duplicate-field warning.
        """
        fields = OrderedDict()
        for b in bases:
            if not isinstance(b, metacls):
                continue

            field_intersection = set(fields) & set(b._fields)
            if field_intersection:
                metacls._field_dupe_warning(name, field_intersection)
            fields.update(b._fields)

        # items() rather than iteritems(): works on both Python 2 and 3
        new_fields = [
            (field_name, field_def)
            for field_name, field_def in dct.items()
            if isinstance(field_def, Field)
        ]

        # the class namespace may be unordered, so restore declaration order
        new_fields.sort(key=lambda fd: fd[1]._index)
        for field_name, field_def in new_fields:
            if field_name in fields:
                metacls._field_dupe_warning(name, (field_name,))
            fields[field_name] = field_def

        return {
            "_fields": fields,
            "_schema_name": name,
        }

    @classmethod
    def from_class(metacls, cls, auto_store=True):
        """Create proper PySchema class from cls

        Any methods and attributes will be transferred to the
        new object

        :param cls:
            Plain class whose namespace is used for the new Record subclass
        :param auto_store:
            When false, automatic registration is switched off while the new
            class is created and restored afterwards
        """
        if auto_store:
            def wrap(cls):
                return cls
        else:
            # called *before* the class is built so registration is already
            # disabled when __new__ runs; applying `wrap` restores the flag
            wrap = no_auto_store()

        attrs = dict(cls.__dict__)
        # `__dict__` and `__weakref__` are descriptors tied to the original
        # class. Copying them would shadow the ones the new class inherits
        # and break e.g. vars(instance) (same approach as six.add_metaclass).
        attrs.pop('__dict__', None)
        attrs.pop('__weakref__', None)

        return wrap(metacls.__new__(
            metacls,
            cls.__name__,
            (Record,),
            attrs
        ))


def disable_auto_register():
    """Stop PySchema from adding newly created record classes to `auto_store`."""
    PySchema.auto_register = False


def enable_auto_register():
    """Make PySchema add newly created record classes to `auto_store` again."""
    PySchema.auto_register = True


def no_auto_store():
    """ Temporarily disable automatic registration of records in the auto_store

    Decorator factory. This is _NOT_ thread safe

    Registration is switched off when the factory is called and the previous
    setting is put back when the returned decorator is applied to the class.

    >>> @no_auto_store()
    ... class BarRecord(Record):
    ...     pass
    >>> BarRecord in auto_store
    False

    """
    previous_setting = PySchema.auto_register
    disable_auto_register()

    def restore_and_pass_through(cls):
        # The class has been created by now; put the flag back as it was
        PySchema.auto_register = previous_setting
        return cls

    return restore_and_pass_through


@no_auto_store()
class Record(object):
    """Abstract base class for structured logging records"""
    __metaclass__ = PySchema

    def __init__(self, *args, **kwargs):
        """Initialise every schema field from `kwargs` or the field default.

        Only keyword arguments are accepted. Keyword arguments that do not
        name a field of the schema are not stored.
        """
        if args:
            # The idea behind only allowing keyword arguments
            # is to prevent accidental misuse of a changed schema
            raise TypeError('Non-keyword arguments not allowed when initializing Records')

        for field_name, field_type in self._fields.items():
            if field_name in kwargs:
                value = kwargs[field_name]
            else:
                value = field_type.default_value()
            # bypass Record.__setattr__: the name is known to be a field
            object.__setattr__(self, field_name, value)

    def __setattr__(self, name, value):
        """Only allow assignment to attributes that are schema fields."""
        if name in self._fields:
            super(Record, self).__setattr__(name, value)
            return

        raise AttributeError("No field %r in %s" % (name, self._schema_name))

    def __unicode__(self):
        return str(self).decode('ascii')

    def __str__(self):
        return repr(self)

    def __repr__(self):
        # SchemaName(field=value, ...) with fields in schema order
        pairs = ['%s=%r' % (fname, getattr(self, fname)) for fname in self._fields]
        return self._schema_name + '(' + ', '.join(pairs) + ')'

    def __cmp__(self, other):
        """Order records by schema name first, then field by field."""
        if not isinstance(other, Record):
            # return default implementation cmp value
            return cmp(id(self), other)

        if self._schema_name != other._schema_name:
            return cmp(self._schema_name, other._schema_name)

        # the first field that differs decides the ordering
        for key in self._fields.keys():
            outcome = cmp(getattr(self, key), getattr(other, key))
            if outcome:
                return outcome
        return 0

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0


def to_json_compatible(record):
    """Dump record in json-encodable object format

    Returns a new dictionary mapping each field name to the result of the
    field's `dump` for the record's value. Fields whose value is None are
    left out.
    """
    d = {}
    # items() rather than iteritems(): works on Python 2 and 3 and with any
    # mapping type used for `_fields`
    for fname, f in record._fields.items():
        val = getattr(record, fname)
        if val is not None:
            d[fname] = f.dump(val)
    return d


def from_json_compatible(schema, dct):
    """Load from json-encodable

    Runs every value of `dct` through the `load` of the schema field with the
    same name and instantiates `schema` with the results as keyword
    arguments. A key without a matching field raises ParseError.
    """
    loaded = {}

    for name in dct:
        field_type = schema._fields.get(name)
        if field_type is not None:
            loaded[name] = field_type.load(dct[name])
            continue
        raise ParseError("Unexpected field encountered in line for record %s: %s" % (schema.__name__, name))

    return schema(**loaded)


def ispyschema(schema):
    """ Is object PySchema instance?

    Returns true for PySchema Record *classes*
    i.e. NOT when schema is a *Record* instance

    The check is an isinstance test against the PySchema metaclass, so it
    holds for any class created through that metaclass.

    >>> class FooRecord(Record):
    ...     pass
    >>> ispyschema(FooRecord)
    True
    >>> ispyschema(FooRecord())
    False
    """
    return isinstance(schema, PySchema)


def load_json_dct(
        dct,
        record_store=None,
        schema=None,
        loader=from_json_compatible
):
    """ Create a Record instance from a json-compatible dictionary

    The dictionary values should have types that are json compatible,
    as if just loaded from a json serialized record string.

    `dct` itself is left untouched; the loader receives a shallow copy
    without the $schema entry.

    :param dct:
        Python dictionary with key/value pairs for the record

    :param record_store:
        Record store to use for schema lookups (when $schema field is present).
        Defaults to the module-level `auto_store`

    :param schema:
        PySchema Record class for the record to load.
        This will override any $schema fields specified in `dct`

    :param loader:
        Callable invoked as `loader(schema, dct)` to build the record

    :raises ParseError:
        If no schema is supplied and `dct` has no $schema entry, or the
        named schema is not found in the record store

    """
    # Work on a shallow copy: the $schema marker is stripped below, and doing
    # that on the caller's dictionary would make it impossible to load the
    # same dictionary a second time.
    dct = dict(dct)

    if schema is None:
        if record_store is None:
            record_store = auto_store
        try:
            schema_name = dct.pop(SCHEMA_FIELD_NAME)
        except KeyError:
            raise ParseError((
                "Serialized record missing '{0}' "
                "record identifier and no schema supplied")
                .format(SCHEMA_FIELD_NAME)
            )
        try:
            schema = record_store.get(schema_name)
        except KeyError:
            raise ParseError(
                "Can't recognize record type %r"
                % (schema_name,), schema_name)
    else:
        # if schema is explicit, use that instead of SCHEMA_FIELD_NAME
        dct.pop(SCHEMA_FIELD_NAME, None)

    return loader(schema, dct)


def loads(
        s,
        record_store=None,
        schema=None,
        loader=from_json_compatible,
        record_class=None  # deprecated in favor of schema
):
    """ Create a Record instance from a json serialized dictionary

    :param s:
        String with a json-serialized dictionary. Anything that is not
        already a text (unicode) string is decoded as UTF-8 first

    :param record_store:
        Record store to use for schema lookups (when $schema field is present)

    :param loader:
        Function called to fetch attributes from json. Typically shouldn't be used by end users

    :param schema:
        PySchema Record class for the record to load.
        This will override any $schema fields specified in `s`

    :param record_class:
        DEPRECATED option, old name for the `schema` parameter

    :raises ParseError:
        If `s` does not start with "{", or the record can't be resolved by
        `load_json_dct`. Errors raised by the json parser for malformed
        input are not converted

    """
    if record_class is not None:
        warnings.warn(
            "The record_class parameter is deprecated in favour of schema",
            DeprecationWarning,
            stacklevel=2
        )
        schema = record_class

    # type(u"") is `unicode` on Python 2 and `str` on Python 3, so the check
    # does not rely on the Python-2-only `unicode` builtin
    text_type = type(u"")
    if not isinstance(s, text_type):
        s = s.decode('utf8')

    if not s.startswith(u"{"):
        raise ParseError("Not a json record")

    json_dct = json.loads(s)
    return load_json_dct(json_dct, record_store, schema, loader)


def dumps(obj, attach_schema_name=True):
    """Serialize record `obj` to a json string.

    :param obj:
        Record instance to serialize

    :param attach_schema_name:
        When true, the full schema name of the record's class is stored
        under the $schema key of the serialized dictionary
    """
    payload = to_json_compatible(obj)

    if attach_schema_name:
        record_class = obj.__class__
        payload[SCHEMA_FIELD_NAME] = get_full_name(record_class)

    return json.dumps(payload)