python source code of testtools

#==============================================================================
#
#  TestTools
#
#  This is a collection of functions that are useful for running the PyReshaper
#  tests on the Yellowstone compute system.
#
#==============================================================================

# Builtin Modules
import os
import glob
import json
import textwrap

# Third-Party Modules
import numpy as np
import Nio

# Package Modules
from pyreshaper import specification


#==============================================================================
# Private Bytesize from Typecode Calculator
#==============================================================================
def _bytesize(tc):
    DTYPE_MAP = {'d': np.float64, 'f': np.float32, 'l': np.long, 'i': np.int32,
                 'h': np.int16, 'b': np.int8, 'S1': np.character}
    return np.dtype(DTYPE_MAP.get(tc, np.float)).itemsize


#==============================================================================
# Private Size from Shape Calculator
#==============================================================================
def _shape2size(shp):
    return 1 if len(shp) < 1 else reduce(lambda x, y: x * y, shp)


#==============================================================================
# Private Bytesize to Unit-string Converter
#==============================================================================
def _nbyte_str(n, exp=0):
    BYTE_UNITS = ['Bytes', 'KB', 'MB', 'GB', 'PB']
    if (n > 1024.):
        return _nbyte_str(n / 1024., exp=exp + 1)
    else:
        if exp < len(BYTE_UNITS):
            units = BYTE_UNITS[exp]
        else:
            n *= 1024.**(exp + 1 - len(BYTE_UNITS))
            units = BYTE_UNITS[-1]
        return '%.4f %s' % (n, units)


#==============================================================================
# Test Database Class
#==============================================================================
class TestDB(object):

    def __init__(self, name='testinfo.json'):
        """
        Initializer

        Parameters:
            name (str): The name of the test database file.  Defaults
                to 'testinfo.json'.

        Raises:
            ValueError: If the test database file cannot be opened and/or
                read.
        """
        # Get the path to the testinfo file
        abs_path = os.path.abspath(name)

        # Try opening and reading the testinfo file
        self._database = {}
        try:
            dbfile = open(abs_path, 'r')
            self._database = dict(json.load(dbfile))
            dbfile.close()
        except:
            err_msg = 'Problem reading and parsing test info file: {0!s}'.format(abs_path)
            raise ValueError(err_msg)

    def getdb(self):
        """
        Return the testing database as a dictionary

        Returns:
            dict: The testing database
        """
        return self._database

    def display(self):
        """
        List the tests in the test database.
        """
        print
        print 'Tests found in the Test Database are:'
        print
        for test_name in self._database:
            print '   {0!s}'.format(test_name)
        return

    def create_specifier(self, test_name, ncfmt='netcdf4c',
                         outdir='', **kwargs):
        """
        Create a Specifier object for the given named test.

        Parameters:
            test_name (str): The string name of the test in the database
                for which to construct the Specifier.
            ncfmt (str): The NetCDF format string to be passed to the
                Specifier.
            outdir (str): An optional path string to be prepended to the
                "output_prefix" argument of the Specifier.  To be used to
                direct output to a different location.  Leave empty if using
                absolute paths in the test's "output_prefix".
            kwargs (dict): A dictionary of additional options to be
                sent to the Specifier.

        Returns:
            Specifier: A Specifier instance with the information to run the
                named test.
        """

        # Check types
        if type(test_name) is not str:
            err_msg = "Test name must be a string"
            raise TypeError(err_msg)
        if type(ncfmt) is not str:
            err_msg = "NetCDF format must be a string"
            raise TypeError(err_msg)
        if type(outdir) is not str:
            err_msg = "Output directory must be a string"
            raise TypeError(err_msg)

        # Check for the given test name
        if test_name not in self._database:
            err_msg = "Test '" + test_name + "' not found in database"
            raise ValueError(err_msg)

        # Define the Specifier input
        input_dir = str(self._database[test_name]['input_dir'])
        infiles = []
        for input_glob in self._database[test_name]['input_globs']:
            input_glob_str = str(input_glob)
            full_input_glob = str(os.path.join(input_dir, input_glob_str))
            infiles.extend(glob.glob(full_input_glob))
        prefix = str(os.path.join(
            outdir, self._database[test_name]['output_prefix']))
        suffix = str(self._database[test_name]['output_suffix'])
        metadata = map(str, self._database[test_name]['metadata'])

        # Remove duplicate parameters from the kwargs dictionary
        kwargs.pop('infiles', None)
        kwargs.pop('prefix', None)
        kwargs.pop('suffix', None)
        kwargs.pop('metadata', None)

        # Return the Specifier
        return specification.Specifier(infiles=infiles, ncfmt=ncfmt,
                                       prefix=prefix, suffix=suffix,
                                       metadata=metadata, **kwargs)


#==============================================================================
# StatDB - Statistics Database
#==============================================================================
class StatDB(object):

    def __init__(self, name=None):
        """
        Initializer

        Parameters:
            name (str): The name of the test statistics file.  Defaults
                to 'teststats.json'.

        Raises:
            ValueError: If the test database file cannot be opened and/or
                read.
        """

        # Initialize test statistics
        self._statistics = {}

        # If the stats filename is given, then try to read it
        if name is not None:
            abs_path = os.path.abspath(name)
            try:
                stfile = open(abs_path, 'r')
                self._statistics = dict(json.load(stfile))
                stfile.close()
            except:
                err_msg = 'Problem reading and parsing test stats file: {0!s}'.format(abs_path)
                raise ValueError(err_msg)

    def getdb(self):
        """
        Return the test analysis statistics as a dictionary

        Returns:
            dict: The statistics database
        """
        return self._statistics

    def analyze(self, database, tests=None, force=False):
        """
        Analyze the test database to determine test statistics

        Parameters:
            database (TestDB): The testing database to analyze
            tests (list): A list of string names of tests in the database
                to analyze.  If None, assume all tests.
            force (bool): Whether to force reanalysis of tests that have
                already been analyzed
        """

        # Check type of testing db object
        if not isinstance(database, TestDB):
            err_msg = "Testing database must be of TestDB type"
            raise TypeError(err_msg)

        # Get the testing database dictionary
        dbdict = database.getdb()

        # Check type
        if tests is not None and not isinstance(tests, (list, tuple)):
            err_msg = "Test name list must be of list or tuple type"
            raise TypeError(err_msg)

        # Assume all tests to be analyzed if None input
        elif tests is None:
            tests = dbdict.keys()

        # Error if tests not in database
        bad_names = [t for t in tests if t not in dbdict]
        if len(bad_names) > 0:
            err_msg = "Tests not found in database: " + ", ".join(bad_names)
            raise ValueError(err_msg)

        # If analysis has already been done, remove those tests
        if not force:
            for test_name in [t for t in tests if t in self._statistics]:
                print "Not Analyzing Test: {0!s}".format(test_name)
            tests = [t for t in tests if t not in self._statistics]

        # Generate statistics for each test
        for test_name in tests:
            print "Analyzing Test: {0!s}".format(test_name)

            # Create a specifier for this test
            spec = database.create_specifier(str(test_name), ncfmt='netcdf')

            # Validate the test information
            spec.validate()

            # Sort the input files by name
            spec.input_file_list.sort()

            # Open the first input file
            infile = Nio.open_file(spec.input_file_list[0], 'r')

            # Get the name of the unlimited dimension (e.g., time)
            tdim = None
            for dim in infile.dimensions:
                if infile.unlimited(dim):
                    tdim = dim
                    continue

            # Add the transverse (i.e., non-time) dimensions and sizes
            xcoords = dict([(v, s) for (v, s) in infile.dimensions.items()
                            if v != tdim])

            # Get the data dimensions
            metadata_names = set(infile.dimensions.keys())

            # Add the extra metadata variable names
            metadata_names.update(set(spec.time_variant_metadata))

            # Gather statistics for variables in dataset
            self._statistics[test_name] = {}
            self._statistics[test_name]['length'] = infile.dimensions[tdim]
            self._statistics[test_name]['variables'] = {}
            for var_name in infile.variables.keys():
                self._statistics[test_name]['variables'][var_name] = {}
                var_obj = infile.variables[var_name]

                tvariant = False
                xshape = var_obj.shape
                if tdim in var_obj.dimensions:
                    xshape = list(xshape)
                    xshape.pop(var_obj.dimensions.index(tdim))
                    xshape = tuple(xshape)
                    tvariant = True
                if not tvariant:
                    metadata_names.add(var_name)
                self._statistics[test_name]['variables'][
                    var_name]['tvariant'] = tvariant
                self._statistics[test_name]['variables'][
                    var_name]['xshape'] = xshape

                xsize = _shape2size(xshape) * _bytesize(var_obj.typecode())
                self._statistics[test_name]['variables'][
                    var_name]['xsize'] = xsize

                if var_name in metadata_names:
                    self._statistics[test_name][
                        'variables'][var_name]['meta'] = True
                else:
                    self._statistics[test_name][
                        'variables'][var_name]['meta'] = False

            # Close the first file
            infile.close()

            # Loop over all input files and compute time-variant variable sizes
            for filename in spec.input_file_list[1:]:

                # Open the file
                infile = Nio.open_file(filename, 'r')

                # And number of time steps to the test data
                self._statistics[test_name]['length'] += \
                    infile.dimensions[tdim]

                # Close the file
                infile.close()

            # Compute self-analysis parameters
            num_steps = self._statistics[test_name]['length']
            var_stats = self._statistics[test_name]['variables']
            tser_vars = [str(v) for (v, s) in var_stats.items()
                         if not s['meta'] and s['tvariant']]
            tvmd_vars = [str(v) for (v, s) in var_stats.items()
                         if s['meta'] and s['tvariant']]
            timd_vars = [str(v) for (v, s) in var_stats.items()
                         if s['meta'] and not s['tvariant']]
            lost_vars = [str(v) for (v, s) in var_stats.items()
                         if not s['meta'] and not s['tvariant']]

            # Store the transverse (to time) coordinate sizes
            self._statistics[test_name]['xcoords'] = xcoords

            # Record the variables names
            self._statistics[test_name]['names'] = {}
            self._statistics[test_name]['names']['tseries'] = tser_vars
            self._statistics[test_name]['names']['tvariant'] = tvmd_vars
            self._statistics[test_name]['names']['tinvariant'] = timd_vars
            self._statistics[test_name]['names']['other'] = lost_vars

            # Compute numbers/counts
            num_tser = len(tser_vars)
            num_tvmd = len(tvmd_vars)
            num_timd = len(timd_vars)
            num_lost = len(lost_vars)
            self._statistics[test_name]['counts'] = {}
            self._statistics[test_name]['counts']['tseries'] = num_tser
            self._statistics[test_name]['counts']['tvariant'] = num_tvmd
            self._statistics[test_name]['counts']['tinvariant'] = num_timd
            self._statistics[test_name]['counts']['other'] = num_lost

            # Compute shapes
            self._statistics[test_name]['xshapes'] = {}
            self._statistics[test_name]['xshapes']['tseries'] = list(
                set([var_stats[v]['xshape'] for v in tser_vars]))
            self._statistics[test_name]['xshapes']['tvariant'] = list(
                set([var_stats[v]['xshape'] for v in tvmd_vars]))
            self._statistics[test_name]['xshapes']['tinvariant'] = list(
                set([var_stats[v]['xshape'] for v in timd_vars]))

            # Compute bytesizes
            self._statistics[test_name]['totalsizes'] = {}
            self._statistics[test_name]['totalsizes']['tseries'] = \
                sum([var_stats[v]['xsize'] for v in tser_vars]) * num_steps
            self._statistics[test_name]['totalsizes']['tvariant'] = \
                sum([var_stats[v]['xsize'] for v in tvmd_vars]) * num_steps
            self._statistics[test_name]['totalsizes']['tinvariant'] = \
                sum([var_stats[v]['xsize'] for v in timd_vars])

            # Compute maxima
            self._statistics[test_name]['maxsizes'] = {}
            maxsize = 0 if num_tser == 0 else \
                max([var_stats[v]['xsize'] for v in tser_vars]) * num_steps
            self._statistics[test_name]['maxsizes']['tseries'] = maxsize
            maxsize = 0 if num_tvmd == 0 else \
                max([var_stats[v]['xsize'] for v in tvmd_vars]) * num_steps
            self._statistics[test_name]['maxsizes']['tvariant'] = maxsize
            maxsize = 0 if num_timd == 0 else \
                max([var_stats[v]['xsize'] for v in timd_vars])
            self._statistics[test_name]['maxsizes']['tinvariant'] = maxsize

    def display(self, tests=None):
        """
        Print the statistics information determined from self analysis

        Parameters:
            tests (list): A list of string names of tests in the database
                to print.  If None, assume all tests.
        """

        # Check type
        if tests is not None and not isinstance(tests, (list, tuple)):
            err_msg = "Test name list must be of list or tuple type"
            raise TypeError(err_msg)

        # Assume all tests to be analyzed if None input
        elif tests is None:
            tests = self._statistics.keys()

        # Error if tests not in database
        bad_names = [t for t in tests if t not in self._statistics]
        if len(bad_names) > 0:
            err_msg = "Tests not found in statistics: " + ", ".join(bad_names)
            raise ValueError(err_msg)

        # Print the statistics information
        for test_name in tests:
            print "Statistics for Test: {0!s}".format(test_name)
            print

            test_stats = self._statistics[test_name]
            num_steps = test_stats['length']

            print "   Number of Time Steps: {0!s}".format(num_steps)
            print

            # Print counts
            num_tser = test_stats['counts']['tseries']
            print "   Number of Time-Series Variables:             {0!s}".format(num_tser)
            num_tvmd = test_stats['counts']['tvariant']
            print "   Number of Time-Variant Metadata Variables:   {0!s}".format(num_tvmd)
            num_timd = test_stats['counts']['tinvariant']
            print "   Number of Time-Invariant Metadata Variables: {0!s}".format(num_timd)
            num_lost = test_stats['counts']['other']
            if num_lost > 0:
                print "   WARNING: {0!s} unclassified variables".format(num_lost)
            print

            # Print the coordinate data
            print "   Transverse Coordinate Shapes:"
            maxlenxcoord = max([len(xc) for xc in test_stats['xcoords']])
            for xcoord, cxsize in test_stats['xcoords'].items():
                spcr = ' ' * (maxlenxcoord - len(xcoord))
                print "       {0!s}:{1!s}{2!s}".format(xcoord, spcr, cxsize)
            print

            # Print names
            print "   Time-Series Variables:"
            vlist = ", ".join([str(v) for v in
                               test_stats['names']['tseries']])
            print "     ", "\n      ".join(textwrap.wrap(vlist))
            print "   Time-Variant Metadata Variables:"
            vlist = ", ".join([str(v) for v in
                               test_stats['names']['tvariant']])
            print "     ", "\n      ".join(textwrap.wrap(vlist))
            print "   Time-Invariant Metadata Variables:"
            vlist = ", ".join([str(v) for v in
                               test_stats['names']['tinvariant']])
            print "     ", "\n      ".join(textwrap.wrap(vlist))
            if num_lost > 0:
                print "   Unclassified Variables (neither meta nor time-variant):"
                vlist = ", ".join([str(v) for v in
                                   test_stats['names']['other']])
                print "     ", "\n      ".join(textwrap.wrap(vlist))
            print

            # Print Transverse Shapes
            print "   Time-Series Variable Transverse Shapes:"
            print "     ", " ".join([str(s) for s in
                                     test_stats['xshapes']['tseries']])
            print "   Time-Variant Metadata Transverse Shapes:"
            print "     ", " ".join([str(s) for s in
                                     test_stats['xshapes']['tvariant']])
            print "   Time-Invariant Metadata Transverse Shapes:"
            print "     ", " ".join([str(s) for s in
                                     test_stats['xshapes']['tinvariant']])
            print

            # Print total bytesizes
            tser_totsize = test_stats['totalsizes']['tseries']
            print "   Time-Series Variable Total Size:    {}".format(_nbyte_str(tser_totsize))
            tvmd_totsize = test_stats['totalsizes']['tvariant']
            print "   Time-Variant Metadata Total Size:   {}".format(_nbyte_str(tvmd_totsize))
            timd_totsize = test_stats['totalsizes']['tinvariant']
            print "   Time-Invariant Metadata Total Size: {}".format(_nbyte_str(timd_totsize))
            print

            # Print maximum bytesizes
            tser_maxsize = test_stats['maxsizes']['tseries']
            print "   Time-Series Variable Max Size:    {}".format(_nbyte_str(tser_maxsize))
            tvmd_maxsize = test_stats['maxsizes']['tvariant']
            print "   Time-Variant Metadata Max Size:   {}".format(_nbyte_str(tvmd_maxsize))
            timd_maxsize = test_stats['maxsizes']['tinvariant']
            print "   Time-Invariant Metadata Max Size: {}".format(_nbyte_str(timd_maxsize))
            print

    def save(self, name="teststats.json"):
        """
        Save the statistics information to a JSON data file

        Parameters:
            name (str): The name of the JSON statistics file to write
        """

        # Check types
        if isinstance(name, str):
            fp = open(name, 'w')
        else:
            err_msg = "Statistics filename must be a string"
            raise TypeError(err_msg)

        # Dump JSON data to file
        try:
            json.dump(self._statistics, fp, sort_keys=True,
                      indent=3, separators=(',', ': '))
        except:
            err_msg = "Failed to write statistics file"
            raise RuntimeError(err_msg)

        # Close the file
        fp.close()


#==============================================================================
# TimeDB - Database for Timing Data
#==============================================================================
class TimeDB(object):

    def __init__(self, name='timings.json'):
        """
        Initializer

        Parameters:
            name (str): The name of the timing database file.  Defaults
                to 'timings.json'.

        Raises:
            ValueError: If the timing database file cannot be opened and/or
                read.
        """
        # See if there is a user-defined timings file, otherwise look for default
        abs_path = os.path.abspath(name)

        # Try opening and reading the testinfo file
        try:
            dbfile = open(abs_path, 'r')
            self._timings = dict(json.load(dbfile))
            dbfile.close()
        except:
            print 'Timings file does not exist.  Creating a new timings database.'
            self._timings = {}

    def getdb(self):
        """
        Return the timings database as a dictionary

        Returns:
            dict: The timings database dictionary
        """
        return self._timings

    def test_has_method(self, test, method):
        """
        Check if given test was done with the given method

        Parameters:
            test (str): The name of the test to query
            method (str): The name of the method to query

        Returns:
            bool: True, if the test was performed with the given method,
                False, otherwise.
        """

        # checking
        if test not in self._timings:
            err_msg = "Given test '{0!s}' not in timings database".format(test)
            raise ValueError(err_msg)

        # Start looking for the method
        if 'results' not in self._timings[test]:
            return False
        return method in self._timings[test]['results']

    def tests_with_methods(self, methods=None):
        """
        Return the list of tests that use the given methods

        If no methods are given, then returns all tests

        Parameters:
            methods (list, tuple): The list of methods to query
        """
        if not methods:
            return [str(t) for t in self._timings.keys()]

        test_set = set()
        for method in methods:
            test_set.update(set(str(t) for t in self._timings
                                if self.test_has_method(t, method)))
        return list(test_set)

    def methods_in_tests(self, tests=None):
        """
        Return the list of methods that are used by all of the given tests

        If no tests given, return list of all methods found.

        Parameters:
            tests (list, tuple): The list of tests to query
        """
        if tests:
            tests_to_search = tests
        else:
            tests_to_search = self._timings.keys()

        method_set = set()
        for test in tests_to_search:
            if 'results' in self._timings[test]:
                method_set.update(
                    set(str(t) for t in self._timings[test]['results'].keys()))
        return list(method_set)

    def display_tests(self, methods=None):
        """
        List the tests in the test database.

        If methods are given, then only the tests using these methods will be
        displayed.

        Parameters:
            methods (list, tuple): A method names to query
        """
        print
        print "Tests",
        if methods:
            print "with methods {0!s}".format(methods),
        print "in the Timings Database:"
        print
        for test in self.tests_with_methods(methods):
            print '   {0!s}'.format(test)

    def display_methods(self, tests=None):
        """
        List the methods in the test database.

        If tests are given, then only the methods used by these tests will be
        displayed.

        Parameters:
            tests (list, tuple): A test names to query
        """
        print
        print "Methods",
        if tests:
            print "used by tests {0!s}".format(tests),
        print "in the Timings Database:"
        print
        for method in self.methods_in_tests(tests):
            print '   {0!s}'.format(method)

    def add_result(self, test, method, job,
                   tser_read=0.0, tim_read=0.0, tvm_read=0.0,
                   tser_write=0.0, tim_write=0.0, tvm_write=0.0,
                   metadata=True, once=False, cores=1, nodes=0,
                   input_open=0.0, output_open=0.0, total=0.0,
                   actual_mb=0.0, requested_mb=0.0, system='yellowstone'):
        """
        Add new timing results to the timings database

        All times are in seconds.

        Parameters:
            test (str): Name of the test associated with the result
            method (str): Name of the method used by the test result
            job (str): Individual job ID string to associate with the result
            tser_read (float): Time to read Time-Series data
            tim_read (float): Time to read Time-Invariant Metadata (TIM)
            tvm_read (float): Time to read Time-Variant Metadata (TVM)
            tser_write (float): Time to write Time-Series data
            tim_write (float): Time to write Time-Invariant Metadata (TIM)
            tvm_write (float): Time to write Time-Variant Metadata (TVM)
            metadata (bool): Whether all metadata was written
            once (bool): Whether metadata was written to a "once" file
            cores (int): Number of cores used for the job
            nodes (int): Number of nodes used for the job
            input_open (float): Time to open all input files
            output_open (float): Time to open all output files
            total (float): Total time for the entire conversion process
            actual_mb (float): Number of MB actually read (assuming a given
                block size) from input files
            requested_mb (float): Number of MB requested
            system (str): Name of the system on which the test was run
        """

        if test not in self._timings:
            self._timings[test] = {'results': {}}
        if method not in self._timings[test]['results']:
            self._timings[test]['results'][method] = {}

        if job not in self._timings[test]['results'][method]:
            self._timings[test]['results'][method][job] = {}
            self._timings[test]['results'][method][job]['sys'] = system
            self._timings[test]['results'][method][job]['cores'] = cores
            self._timings[test]['results'][method][job]['nodes'] = nodes
            self._timings[test]['results'][method][job]['real'] = total
            self._timings[test]['results'][method][job]['metadata'] = metadata
            self._timings[test]['results'][method][job]['once'] = once
            self._timings[test]['results'][method][job]['actual'] = actual_mb
            self._timings[test]['results'][method][
                job]['request'] = requested_mb
            self._timings[test]['results'][method][job]['openi'] = input_open
            self._timings[test]['results'][method][job]['openo'] = output_open
            self._timings[test]['results'][method][job]['metaTIr'] = tim_read
            self._timings[test]['results'][method][job]['metaTVr'] = tvm_read
            self._timings[test]['results'][method][job]['TSr'] = tser_read
            self._timings[test]['results'][method][job]['metaTIw'] = tim_write
            self._timings[test]['results'][method][job]['metaTVw'] = tvm_write
            self._timings[test]['results'][method][job]['TSw'] = tser_write

    def get_results(self, test, method):
        """
        Get timings results as a dictionary for a given test and method.

        Parameters:
            test (str): Name of the test
            method (str); Name of the method
        """
        if test not in self._timings:
            err_msg = "Test {0!s} not in timings database".format(test)
            raise KeyError(err_msg)
        if self.test_has_method(test, method):
            return self._timings[test]['results'][method]
        else:
            err_msg = "Method {0!s} not in test {1!s} database".format(
                method, test)
            raise KeyError(err_msg)

    def save(self, name="timings.json"):
        """
        Save the timing information to a JSON data file

        Parameters:
            name (str): The name of the JSON timings file to write
        """

        # Check types
        if isinstance(name, str):
            fp = open(name, 'w')
        else:
            err_msg = "Statistics filename must be a string"
            raise TypeError(err_msg)

        # Dump JSON data to file
        try:
            json.dump(self._timings, fp, sort_keys=True,
                      indent=3, separators=(',', ': '))
        except:
            err_msg = "Failed to write statistics file"
            raise RuntimeError(err_msg)

        # Close the file
        fp.close()