# Python source code of es_doc

"""Publishes documents to Elasticsearch"""
import socket
import asyncio
from queue import Queue
from datetime import (datetime)
import math
from pytz import utc
from homeassistant.const import (CONF_DOMAINS, CONF_ENTITIES, CONF_EXCLUDE)
from homeassistant.helpers import (
    state as state_helper
)
from .const import (
    CONF_TAGS, CONF_PUBLISH_FREQUENCY, CONF_ONLY_PUBLISH_CHANGED
)

from .logger import LOGGER

from .es_serializer import get_serializer

class DocumentPublisher:
    """Publishes Home Assistant state documents to Elasticsearch.

    State changes are queued through ``enqueue_state`` and flushed to the
    cluster in bulk by a background timer that wakes up every
    ``CONF_PUBLISH_FREQUENCY`` seconds.
    """

    def __init__(self, config, gateway, index_manager, hass, system_info):
        """Initialize the publisher and start the background publish timer.

        Args:
            config: Component configuration, read with ``config.get`` for the
                tags, publish frequency, "only publish changed" flag and the
                exclusion lists.
            gateway: Object exposing ``get_sync_client()`` and ``es_version``.
            index_manager: Object exposing ``index_alias``, the index name
                documents are written to.
            hass: Home Assistant instance; supplies the core configuration,
                the state machine and the executor.
            system_info: Mapping with the ``version``, ``arch`` and
                ``os_name`` keys.
        """
        self._gateway = gateway
        self._hass = hass

        self._index_alias = index_manager.index_alias

        self._serializer = get_serializer()

        config_dict = hass.config.as_dict()

        # Properties which are identical for every published document.
        self._static_doc_properties = {
            'agent.name': config_dict.get('name', 'My Home Assistant'),
            'agent.type': 'hass',
            'agent.version': system_info['version'],
            'ecs.version': '1.0.0',
            'host.geo.location': {
                'lat': config_dict['latitude'],
                'lon': config_dict['longitude']
            } if 'latitude' in config_dict else None,
            'host.architecture': system_info['arch'],
            'host.os.name': system_info['os_name'],
            'host.hostname': socket.gethostname(),
            'tags': config.get(CONF_TAGS)
        }

        self._publish_frequency = config.get(CONF_PUBLISH_FREQUENCY)
        self._only_publish_changed = config.get(CONF_ONLY_PUBLISH_CHANGED)

        # Normalize missing exclusion settings to empty collections so the
        # membership tests performed later never operate on None.
        excluded = config.get(CONF_EXCLUDE) or {}
        self._excluded_domains = excluded.get(CONF_DOMAINS) or []
        self._excluded_entities = excluded.get(CONF_ENTITIES) or []

        if self._excluded_domains:
            LOGGER.debug("Excluding the following domains: %s",
                         self._excluded_domains)

        if self._excluded_entities:
            LOGGER.debug("Excluding the following entities: %s",
                         self._excluded_entities)

        self.publish_queue = Queue()
        self._last_publish_time = None
        self._publish_timer_task = None

        self._start_publish_timer()

    def queue_size(self):
        """Return the approximate number of entries waiting to be published."""
        return self.publish_queue.qsize()

    def last_publish_time(self):
        """Return when the last publish started, or None if none has run."""
        return self._last_publish_time

    def _is_excluded(self, state):
        """Return True if the state's domain or entity id is excluded."""
        return (state.domain in self._excluded_domains
                or state.entity_id in self._excluded_entities)

    def enqueue_state(self, entry):
        """Queue the provided state change unless its entity is excluded.

        Args:
            entry: Mapping with a ``state`` key (object exposing ``domain``
                and ``entity_id``) and an ``event`` key whose ``time_fired``
                is read when the entry is published.
        """
        state = entry['state']
        entity_id = state.entity_id

        if state.domain in self._excluded_domains:
            LOGGER.debug(
                "Skipping %s: it belongs to an excluded domain", entity_id)
            return

        if entity_id in self._excluded_entities:
            LOGGER.debug(
                "Skipping %s: this entity is explicitly excluded", entity_id)
            return

        self.publish_queue.put(entry)

    async def async_do_publish(self):
        """Publish all queued documents to the Elasticsearch cluster.

        Drains the queue into bulk actions. Unless only changed states are to
        be published, the current state of every other non-excluded entity is
        added as well, timestamped with the publish time. The bulk request
        itself runs in the executor.
        """
        from queue import Empty
        from elasticsearch import ElasticsearchException

        if self.publish_queue.empty():
            LOGGER.debug("Skipping publish because queue is empty")
            return

        LOGGER.debug("Collecting queued documents for publish")
        actions = []
        queued_entity_ids = set()
        self._last_publish_time = datetime.now()

        # Drain without blocking: a blocking get() on an unexpectedly empty
        # queue would stall the event loop.
        while True:
            try:
                entry = self.publish_queue.get_nowait()
            except Empty:
                break

            state = entry["state"]
            queued_entity_ids.add(state.entity_id)
            actions.append(self._state_to_bulk_action(
                state, entry["event"].time_fired))

        if not self._only_publish_changed:
            for state in self._hass.states.async_all():
                if self._is_excluded(state):
                    continue

                # Entities with queued changes are already represented.
                if state.entity_id not in queued_entity_ids:
                    actions.append(self._state_to_bulk_action(
                        state, self._last_publish_time))

        LOGGER.info("Publishing %i documents to Elasticsearch", len(actions))

        try:
            await self._hass.async_add_executor_job(self.bulk_sync_wrapper, actions)
        except ElasticsearchException as err:
            LOGGER.exception(
                "Error publishing documents to Elasticsearch: %s", err)

    def bulk_sync_wrapper(self, actions):
        """
        Wrapper to publish events.
        Workaround for elasticsearch_async not supporting bulk operations

        Errors raised as ElasticsearchException are logged, not propagated.
        """
        from elasticsearch import ElasticsearchException
        from elasticsearch.helpers import bulk

        try:
            bulk_response = bulk(self._gateway.get_sync_client(), actions)
            LOGGER.debug("Elasticsearch bulk response: %s",
                         str(bulk_response))
            LOGGER.info("Publish Succeeded")
        except ElasticsearchException as err:
            LOGGER.exception(
                "Error publishing documents to Elasticsearch: %s", err)

    def _state_to_bulk_action(self, state, time):
        """Create a bulk "index" action from the given state object.

        Args:
            state: State object exposing ``state``, ``attributes``,
                ``domain``, ``object_id`` and ``entity_id``.
            time: datetime used as the document's ``@timestamp``. A naive
                value is converted to UTC with ``astimezone``, which treats
                it as system local time.

        Returns:
            A dict describing the bulk action. It includes ``_type`` only
            when the gateway reports Elasticsearch major version 6.
        """
        # Prefer a numeric value; fall back to the raw state when it is not
        # numeric or is not a number Elasticsearch accepts (inf / NaN).
        try:
            _state = state_helper.state_as_number(state)
            if not is_valid_number(_state):
                _state = state.state
        except ValueError:
            _state = state.state

        time_tz = time.astimezone(utc) if time.tzinfo is None else time

        attributes = {}
        for orig_key, orig_value in dict(state.attributes).items():
            # ES will attempt to expand any attribute keys which contain a ".",
            # so we replace them with an "_" instead.
            # https://github.com/legrego/homeassistant-elasticsearch/issues/92
            key = orig_key.replace(".", "_")

            # Skip any attributes with empty keys. Elasticsearch cannot index these.
            # https://github.com/legrego/homeassistant-elasticsearch/issues/96
            if not key:
                LOGGER.warning(
                    "Not publishing keyless attribute from entity [%s].",
                    state.entity_id
                )
                continue

            # coerce set to list. ES does not handle sets natively
            value = list(orig_value) if isinstance(orig_value, set) else orig_value

            # if the list/tuple contains simple strings, numbers, or booleans, then we should
            # index the contents as an actual list. Otherwise, we need to serialize
            # the contents so that we can respect the index mapping
            # (Arrays of objects cannot be indexed as-is)
            # Only the first element is inspected to make that decision.
            if value and isinstance(value, (list, tuple)):
                should_serialize = isinstance(
                    value[0], (tuple, dict, set, list))
            else:
                should_serialize = isinstance(value, dict)

            attributes[key] = self._serializer.dumps(
                value) if should_serialize else value

        document_body = {
            'hass.domain': state.domain,
            'hass.object_id': state.object_id,
            'hass.object_id_lower': state.object_id.lower(),
            'hass.entity_id': state.entity_id,
            'hass.entity_id_lower': state.entity_id.lower(),
            'hass.attributes': attributes,
            'hass.value': _state,
            '@timestamp': time_tz
        }

        document_body.update(self._static_doc_properties)

        # Expose entity coordinates as a dedicated geo field when available.
        if 'latitude' in attributes and 'longitude' in attributes:
            document_body['hass.geo.location'] = {
                'lat': attributes['latitude'],
                'lon': attributes['longitude']
            }

        action = {
            "_op_type": "index",
            "_index": self._index_alias,
            "_source": document_body
        }
        if self._gateway.es_version.major == 6:
            action["_type"] = "doc"
        return action

    def _start_publish_timer(self):
        """Schedule the publish timer coroutine on the event loop."""
        # Keep a reference to the task: the event loop only holds weak
        # references, so an unreferenced task may be garbage-collected.
        self._publish_timer_task = asyncio.ensure_future(
            self._publish_queue_timer())

    def _should_publish(self):
        """Return True if there are queued documents to publish."""
        if self.publish_queue.empty():
            LOGGER.debug("should_publish: queue is empty")
            return False

        return True

    async def _publish_queue_timer(self):
        """Run the publish loop, waking up every publish interval.

        An unexpected error in one cycle is logged and the loop continues;
        cancellation propagates immediately.
        """
        LOGGER.debug("Starting publish timer: executes every %i seconds.",
                     self._publish_frequency)
        while True:
            try:
                if self._should_publish():
                    await self.async_do_publish()
                else:
                    LOGGER.debug("Nothing to publish")
            except asyncio.CancelledError:
                # Never swallow or delay cancellation of the timer task.
                raise
            except Exception:  # pylint: disable=broad-except
                # One failed cycle must not stop all future publishing.
                LOGGER.exception(
                    "Unexpected error while publishing documents")

            await asyncio.sleep(self._publish_frequency)

def is_valid_number(number):
    """Return True when the number can be indexed by Elasticsearch.

    Infinite values and NaN are rejected; every other number is accepted.
    """
    if math.isinf(number):
        return False
    # NaN is the only value that compares unequal to itself.
    return not number != number  # pylint: disable=comparison-with-itself