# This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. """ This module includes helpers for capturing and storing measurement data. """ import traceback import signal import atexit import itertools from collections import OrderedDict, deque from contextlib import contextmanager import tempfile import csv import os from switchy import utils import multiprocessing as mp import time try: import pandas as pd except ImportError as ie: utils.log_to_stderr().warn(str(ie)) pd = None else: from . import shmarray # use the entire screen width + wrapping when viewing frames in the console pd.set_option('display.expand_frame_repr', False) # app names should generally be shorter then this... min_size = 30 def moving_avg(x, n=100): '''Compute the windowed arithmetic mean of `x` with window length `n` ''' n = min(x.size, n) cs = pd.np.cumsum(x) cs[n:] = cs[n:] - cs[:-n] # cs[n - 1:] / n # true means portion return cs / n # NOTE: first n-2 vals are not true means _storetypes = {} def store(cls): _storetypes[cls.ext] = cls return cls def get_storetype(ext): return _storetypes[ext] def tmpfile(ext): return tempfile.mktemp() + '_switchy_data.{}'.format(ext) class Terminate(Exception): """"A unique error type to trigger writer proc termination """ @store class HDFStore(object): """HDF5 storage. Wraps a `pandas.HDFStore` for use with multiple processes. """ key = 'data' # table key # app names should generally be shorter then this... min_size = 30 ext = 'hdf5' def __init__(self, path, dtypes=None): self.path = path self.dtypes = dtypes self._store = pd.HDFStore(path=path, mode='a') self._store.close() @classmethod @contextmanager def reader(cls, path, dtypes=None): with cls(path).open(mode='r') as store: yield store @classmethod @contextmanager def writer(cls, path, dtypes=None, mode='a'): with cls(path).open(mode=mode) as store: yield store @contextmanager def open(self, mode='r'): self._store.open(mode=mode) yield self self._store.close() def put(self, df, key=None): """Write a `pd.DataFrame` to disk by appending it to the HDF table. Note: the store must already have been opened by the caller. """ self._store.append( key or self.key, df, dropna=False, min_itemsize=self.min_size, ) self._store.flush(fsync=True) def read(self): with self.open(): return self._store[self.key] @property def data(self): return self.read() def __len__(self): with self.open(): return len(self._store.keys()) @classmethod def multiwrite(cls, storepath, dfitems): """"Store all data frames (from `dfitems`) in a single hdf5 file. """ with cls.writer("{}.{}".format(storepath, cls.ext)) as store: for path, df in dfitems: store.put(df, key=path) return store.path @classmethod def multiread(cls, storepath, dtypes=None): with cls.reader(storepath) as store: store = store._store return pd.concat( (store[key] for key in store.keys()), axis=1, ) @store class CSVStore(object): """CSV storage. """ ext = 'csv' def __init__(self, path, dtypes=None): self.path = path # check for a literal numpy dtype dtypes = getattr(dtypes, 'descr', dtypes) if dtypes is not None and iter(dtypes): # handle pandas `DataFrame.dtypes` items = getattr(dtypes, 'iteritems', None) if items: dtypes = items() self.dtypes = OrderedDict(dtypes) self.fields = self.dtypes.keys() else: self.dtypes = self.fields = dtypes self._ondisk = False self.csvfile = self.csvreader = self.csvwriter = None if not os.path.exists(path): self._headerlen = 0 else: with self.open(): self._headerlen = self.bytelen() def bytelen(self): """Report the current length bytes written to disk """ self.csvfile.seek(0, 2) return self.csvfile.tell() def ondisk(self): if not self._ondisk: try: with self.open(): self._ondisk = bool(self.bytelen() > self._headerlen) return self._ondisk except IOError: return False return True @contextmanager def open(self, mode='r', path=None): with open(path or self.path, mode=mode) as csvfile: self.csvfile = csvfile yield self self.csvfile = None @classmethod @contextmanager def reader(cls, path, dtypes=None): with cls(path, dtypes=dtypes).open() as self: self.csvreader = csv.reader(self.csvfile) yield self self.csvreader = None @classmethod @contextmanager def writer(cls, path, dtypes=None, mode='a'): existed = os.path.exists(path) with cls(path, dtypes=dtypes).open(mode=mode) as self: self.csvwriter = csv.writer(self.csvfile) # write a header line if no prior file existed if not existed and self.fields: self.csvwriter.writerow(self.fields) self._headerlen = self.bytelen() yield self self.csvwriter = None if pd: def put(self, df): """Append a `pd.DataFrame` to our csv file """ df.to_csv(self.path, header=False, mode='a') def read(self): """Read the entire csv data set into a `pd.DataFrame` """ return pd.read_csv(self.path, dtype=self.dtypes) else: def put(self, row): """Append an array's worth of data points to to our csv file. Note: this store must be opened as a writer prior to using this method. """ self.csvwriter.writerow(row) self.csvfile.flush() def read(self): """Read the entire csv data set into a list of lists (the rows). """ with self.reader(self.path, dtypes=self.dtypes) as store: return list(store.csvreader) @property def data(self): return self.read() def __len__(self): return len(self.read()) if self.ondisk() else 0 @classmethod def multiwrite(cls, storepath, dfitems): os.makedirs(os.path.dirname(storepath + '/')) # make a subdir for path, df in dfitems: filename = '{}.{}'.format(path.replace('/', '-'), cls.ext) filepath = os.path.join(storepath, filename) with cls.writer(filepath, dtypes=df.dtypes) as store: store.put(df) return storepath @classmethod def multiread(cls, storepath, dtypes=None): files = deque() for dirpath, dirnames, filenames in os.walk(storepath): for csvfile in filter(lambda name: cls.ext in name, filenames): fullpath = os.path.join(dirpath, csvfile) # sort frames by placing the operator data sets at the end if '-' in csvfile: files.append(fullpath) else: files.appendleft(fullpath) frames = [] for path in files: with cls.reader(path, dtypes=dtypes) as store: frames.append(store.read()) return pd.concat(frames, axis=1) if pd else frames class RingBuffer(object): """A circular buffer interface to a shared `numpy` array """ def __init__(self, dtype, size=2**10): # allocated a shared mem np structured array self._shmarr = shmarray.create(size, dtype=dtype) self._len = len(self._shmarr) # shared current absolute row insertion-index self.ri = mp.Value('i', 0, lock=False) def put(self, row): bi = self.bi # increment row insertion index for the next entry (this means # the last entry is at now at i - 1) self.ri.value += 1 try: self._shmarr[bi] = row except ValueError: # XXX should never happen during production (since it's # means the dtype has been setup wrong) self.ri.value -= 1 def read(self): """Return the contents of the FIFO array without incrementing the start index. """ return self._shmarr[:len(self)] @property def df(self): """The buffer's current contents as a `pd.DataFrame`. """ return pd.DataFrame.from_records(self.read()) def __len__(self): """Current array length up the last inserted data point """ bi = self.bi ri = self.ri l = self._len if not bi: # handles the 1 % 1 == 0 case when l == 1 return bi if ri.value < l else l return bi @property def bi(self): """Current insertion index of in mem frame buffer (i.e. the index where the next value should be inserted) """ return self.ri.value % self._len def is_full(self): return self.bi == 0 and self.ri.value > self._len - 1 class DataStorer(object): """Receive and store row-oriented data points from switchy apps. A shared-memory buffer array is used to store the most recently written data (rows) and is flushed incrementally the to the chosen storage backend. """ def __init__(self, name, dtype, buf_size=2**10, path=None, storetype=None): self.name = name try: self.dtype = pd.np.dtype(dtype) if pd else dtype except TypeError: # set all columns to float64 self.dtype = pd.np.dtype( list(zip(dtype, itertools.repeat('float64'))) ) self.log = utils.get_logger(type(self).__name__) # allocated a shared mem np structured array self._buf_size = buf_size # purely for testing self._buffer = RingBuffer( dtype=self.dtype, size=buf_size) if pd else None # parent proc read-only access to disk store self.storetype = storetype or CSVStore self._storepath = path or tmpfile(self.storetype.ext) self.store = self.storetype(self._storepath, dtypes=self.dtype) self.queue = mp.Queue() self._iput = 0 # queue put counter # disable SIGINT while we spawn signal.signal(signal.SIGINT, signal.SIG_IGN) # setup bg writer self._writer = mp.Process( target=_consume_and_write, args=( self.queue, self._storepath, self.store, self._buffer), name='{}_frame_writer'.format(self.name), ) self._writer.start() # re-enable SIGINT signal.signal(signal.SIGINT, signal.default_int_handler) # kill subproc on exit atexit.register(self.stopwriter) # ensure writer is initialized path = self.queue.get(timeout=3) assert path == self._storepath if pd: @property def buffer(self): """The latest set of buffered data points not yet pushed to disk """ return self._buffer.df @property def data(self): """Copy of the entire data set recorded thus far """ if self.store: return pd.concat( (self.store.data, self.buffer), ignore_index=True ) return self.buffer else: @property def data(self): """Copy of the data points recorded thus far """ with self.store.reader( self.store.path, dtypes=self.dtype ) as reader: return reader.data[1:] def append_row(self, row=None): """Push a row of data onto the consumer queue """ start = time.time() self.queue.put(row) self._iput += 1 diff = time.time() - start if diff > 0.005: # any more then 5ms warn the user self.log.warn("queue.put took '{}' seconds".format(diff)) def stopwriter(self): """Trigger the background frame writer to terminate """ self.queue.put(Terminate, timeout=3) def _consume_and_write(queue, path, store, sharr): """Insert :var:`row` received from the queue into the shared memory array at the current index and increment. Empty rows are always written to disk (keeps stores 'call-index-aligned'). """ proc = mp.current_process() slog = utils.get_logger(proc.name) log = mp.log_to_stderr(slog.getEffectiveLevel()) log.debug("starting storage writer '{}'".format(proc.name)) log.info("storage path is '{}'".format(path)) log.debug("sharr is '{}'".format(sharr)) # set up a new store instance for writing with store.writer(path, dtypes=store.dtypes) as store: # notify parent that file has been created queue.put(path) # handle no pandas/np case buff = store if sharr is None else sharr bufftype = type(buff) log.debug('buffer type is {}'.format(bufftype)) for row in iter(queue.get, Terminate): # consume and process now = time.time() # write frame to disk on buffer fill if sharr and sharr.is_full(): log.debug('writing to {} storage...'.format(store.ext)) try: # push a data frame store.put(pd.DataFrame.from_records(buff.read())) except ValueError: log.error(traceback.format_exc()) log.debug("storage put took '{}'".format(time.time() - now)) try: # push to ring buffer (or store if no pd) buff.put(row) log.debug("{} insert took '{}'".format( bufftype, time.time() - now)) except ValueError: log.error(traceback.format_exc()) log.debug("terminating frame writer '{}'".format(proc.name))