#!/usr/bin/python# # coding=utf-8 ''' Daemon that reports stats for this MonacoNode's Apps ''' from __future__ import print_function, division import redis import schema import redismgmt import time import logging import socket import psutil import threading import json import nutmgmt import config STATLOGGER = logging.getLogger('monaco.stats') PUBLISHER = config.config['statpublisher']() class MonacoHandler(threading.Thread): ''' Thread that handles a specific app's stat reporting, for better cpu monitoring ''' def __init__(self, node_id): threading.Thread.__init__(self) self.r = redis.StrictRedis(port=config.config['mgmt_port']) self.node_id = node_id self.interval = config.config['stats']['interval'] self.threshold = 3 self.failcount = 0 self.logger = STATLOGGER self.redmgr = redismgmt.RedisMgmt() self._run = True def stop(self): ''' stop thread ''' self._run = False def run(self): ''' Runs until no longer valid (app moved, not master, etc), or repeated errors over self.threshold ''' while self._run: start_time = time.time() if self.failcount >= self.threshold: self.logger.error('Node stat reporter thread for Monaco DB is sleeping to prevent thrashing') time.sleep(15) # Collect/Publish stats try: info = self.r.info() for stat in ['connected_slaves', 'used_memory', 'connected_clients', 'instantaneous_ops_per_sec']: PUBLISHER.publish_monaco_stat(self.node_id, stat, info[stat]) proc = self.redmgr.instance_proc('6379') if proc: PUBLISHER.publish_monaco_stat(self.node_id, 'cpu_percent', proc.cpu_percent(interval=self.interval)) except redis.ConnectionError: self.logger.info('Redis error accessing Monaco DB info') del self.r self.r = redis.StrictRedis(port=config.config['mgmt_port']) except psutil.Error, err: self.logger.info('psutil error - %s', repr(err)) self.failcount += 1 except Exception, exc: self.logger.exception(exc) self.failcount += 1 spent_time = time.time() - start_time if self.interval > spent_time: time.sleep(self.interval - spent_time) self.logger.debug('monaco stat handler reporting cycle') self.failcount = 0 class AppHandler(threading.Thread): ''' Thread that handles a specific app's stat reporting, for better cpu monitoring ''' def __init__(self, app_id, node_id, threshold=3): threading.Thread.__init__(self) self.app_id = app_id self.app = schema.App(app_id=self.app_id) self.r = redis.StrictRedis(port=config.config['mgmt_port']) self.app.refresh(self.r) self.app_conn = self.app.get_master_connection(self.r) self.node_id = node_id self.interval = config.config['stats']['interval'] self.threshold = threshold self.failcount = 0 self.logger = STATLOGGER self.redmgr = redismgmt.RedisMgmt() self._run = True def stop(self): ''' stop thread ''' self._run = False def run(self): ''' Runs until no longer valid (app moved, not master, etc), or repeated errors over self.threshold ''' while self._run: start_time = time.time() if self.failcount >= self.threshold: self.logger.warn('Node stat reporter thread for app %s is terminating from thrashing', self.app_id) return # Refresh App and verify our responsibility try: self.app.refresh(self.r) if not self.node_id in self.app.nodes or self.app.nodes[self.node_id] != 'master': # No longer this node's responsibility to report stats return except redis.RedisError: self.logger.info('RedisError updating app %s', self.app_id) del self.r self.r = redis.StrictRedis(port=config.config['mgmt_port']) self.failcount += 1 continue except Exception, exc: self.logger.exception(exc) self.failcount += 1 continue # Collect/Publish stats try: info = self.app_conn.info() for stat in ['connected_slaves', 'used_memory', 'connected_clients', 'instantaneous_ops_per_sec']: PUBLISHER.publish_app_stat(self.app_id, stat, info[stat]) proc = self.redmgr.instance_proc(self.app_id) if proc: PUBLISHER.publish_app_stat(self.app_id, 'cpu_percent', proc.cpu_percent(interval=self.interval)) PUBLISHER.publish_app_stat(self.app_id, 'master_unavailable', 0) except redis.ConnectionError: self.logger.info('Redis error accessing %s info', self.app_id) PUBLISHER.publish_app_stat(self.app_id, 'master_unavailable', 1) del self.app_conn self.app_conn = self.app.get_master_connection(self.r) except psutil.Error, err: self.logger.info('psutil error - %s', repr(err)) time.sleep(self.interval) # since the wait time might not have been observed self.failcount += 1 except Exception, exc: self.logger.exception(exc) self.failcount += 1 spent_time = time.time() - start_time if self.interval > spent_time: time.sleep(self.interval - spent_time) self.logger.debug('stat handler(%s) reporting cycle', self.app_id) self.failcount = 0 class TwemHandler(threading.Thread): ''' Thread that handles a specific proxy's stat reporting ''' def __init__(self, twem_id, node_id, host, threshold=3): threading.Thread.__init__(self) self.twem_id = twem_id self.node_id = node_id self.twem = schema.MonacoTwem(twem_id=twem_id) self.host = host self.nutmgr = nutmgmt.NutMgmt() self.interval = config.config['stats']['interval'] self.threshold = threshold self.failcount = 0 self.logger = STATLOGGER self._run = True def get_twem_stat(self): ''' Returns JSON-parsed data for self.twem. self.twem should be 'refreshed' ''' try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(('localhost', self.twem.stat_port)) data = json.load(sock.makefile()) return data[self.twem.name] except Exception: STATLOGGER.warn('Exception getting proxy stats for twem %s', self.twem.twem_id) return None def stop(self): ''' stops the thread ''' self._run = False def run(self): ''' Runs until no longer valid (app moved, not master, etc), or repeated errors over self.threshold ''' self.r = redis.StrictRedis(port=config.config['mgmt_port']) while self._run: start_time = time.time() if self.failcount >= self.threshold: self.logger.warn('Node stat reporter thread for twem %s is terminating from thrashing', self.twem_id) return # Refresh App and verify our responsibility try: self.twem.refresh(self.r) if not self.node_id in self.twem.nodes: # No longer this node's responsibility to report stats return data = self.get_twem_stat() self.logger.debug(repr(data)) except redis.RedisError: self.logger.info('RedisError updating twem %s', self.twem_id) self.r = redis.StrictRedis(port=config.config['mgmt_port']) self.failcount += 1 continue except Exception, exc: self.logger.exception(exc) self.failcount += 1 continue # Collect/Publish stats try: for stat in ['client_connections', 'client_err', 'forward_error']: PUBLISHER.publish_twem_stat(self.twem_id, self.host, stat, data[stat]) proc = self.nutmgr.nutcracker_instance_proc(self.twem) if proc: PUBLISHER.publish_twem_stat(self.twem_id, self.host, 'cpu_percent', proc.cpu_percent(interval=self.interval)) PUBLISHER.publish_twem_stat(self.twem_id, self.host, 'host_unavailable', 0) except psutil.Error, err: self.logger.info('psutil error - %s', repr(err)) time.sleep(self.interval) # since the wait time might not have been observed self.failcount += 1 except Exception, exc: self.logger.exception(exc) PUBLISHER.publish_twem_stat(self.twem_id, self.host, 'host_unavailable', 1) self.failcount += 1 spent_time = time.time() - start_time if self.interval > spent_time: time.sleep(self.interval - spent_time) self.logger.debug('stat handler(%s) reporting cycle', self.twem_id) self.failcount = 0 def main(): ''' This is a jazzier version of the node stats reporter. It will spin up N threads (where N = the number of app Masters on this node) Those threads will report stats on the config interval ''' r = redis.StrictRedis(port=config.config['mgmt_port']) monaco = schema.Monaco() monaco.refresh(r) host = config.config['hostname'] node_id = monaco.node_ids_by_hostname[host] node = schema.MonacoNode(node_id=node_id) monaco_handler = MonacoHandler(node_id) monaco_handler.start() app_threadmap = {} twem_threadmap = {} while True: try: node.refresh(r) # Set up this node's master DB handlers for app_id in app_threadmap.keys(): if app_id not in node.apps: # child thread should die a natural, painless death app_threadmap[app_id].stop() del app_threadmap[app_id] STATLOGGER.debug('deleted %s', app_id) for app_id in node.apps: app = schema.App(app_id=app_id) app.refresh(r) if app.nodes[node.node_id] != 'master': if app_id in app_threadmap: app_threadmap[app_id].stop() del app_threadmap[app_id] STATLOGGER.debug('deleted %s', app_id) continue if not app_id in app_threadmap: # perhaps a new thing app_threadmap[app_id] = AppHandler(app_id, node_id) app_threadmap[app_id].start() STATLOGGER.debug('started %s', app_id) elif not app_threadmap[app_id].is_alive(): del app_threadmap[app_id] app_threadmap[app_id] = AppHandler(app_id, node_id) app_threadmap[app_id].start() STATLOGGER.info('restarted %s', app_id) # Set up this node's twem handlers for twem_id in twem_threadmap.keys(): if twem_id not in node.twems: # child thread should die a natural, painless death twem_threadmap[twem_id].stop() del twem_threadmap[twem_id] STATLOGGER.debug('deleted %s', twem_id) for twem_id in node.twems: twem = schema.MonacoTwem(twem_id=twem_id) twem.refresh(r) if not twem_id in twem_threadmap: # perhaps a new thing twem_threadmap[twem_id] = TwemHandler(twem_id, node_id, host) twem_threadmap[twem_id].start() STATLOGGER.debug('started %s', twem_id) elif not twem_threadmap[twem_id].is_alive(): del twem_threadmap[twem_id] twem_threadmap[twem_id] = TwemHandler(twem_id, node_id, host) twem_threadmap[twem_id].start() STATLOGGER.info('restarted %s', twem_id) except redis.RedisError: r = redis.StrictRedis(port=config.config['mgmt_port']) except Exception, exc: STATLOGGER.exception(exc) time.sleep(5) if __name__ == '__main__': config.initLoggers() main()