# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
climatology2.py

Compute a multi-epoch (multi-day) climatology from daily SST Level-3 grids.

Simple code to be run on a Spark cluster, or using multi-core parallelism on a single machine.
"""

import sys, os, urlparse, urllib, re, time
import numpy as N
import matplotlib
matplotlib.use('Agg')
import matplotlib.pylab as M

from variables import getVariables, close
from cache import retrieveFile, CachePath
from split import fixedSplit, splitByNDays
from netCDF4 import Dataset, default_fillvals
from pathos.multiprocessing import ProcessingPool as Pool

from plotlib import imageMap, makeMovie
from spatialFilter import spatialFilter
from gaussInterp import gaussInterp                   # calls into Fortran version gaussInterp_f.so
#from gaussInterp_slow import gaussInterp_slow as gaussInterp    # pure python, slow debuggable version
#import pyximport; pyximport.install(pyimport=False)
#from gaussInterp import gaussInterp, gaussInterp_    # attempted cython version, had issues

VERBOSE = 1

# Possible execution modes.
# Multicore & cluster modes use pathos pool.map(); Spark mode uses a PySpark cluster.
ExecutionModes = ['sequential', 'multicore', 'cluster', 'spark']

# SST L3m 4.6km Metadata
# SST values are scaled integers in degrees Celsius, lon/lat is 8640 x 4320
# Variable = 'sst', Mask = 'qual_sst', Coordinates = ['lon', 'lat']

# Generate algorithmic name for N-day Climatology product
SSTClimatologyTemplate = 'SST.L3.Global.Clim.%(period)s.%(date)s.%(version)s.nc'    #??
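# Hypothetical illustration (not used below) of how the template could be filled in;
# the period/date/version values here are made-up examples:
#   SSTClimatologyTemplate % {'period': '5day', 'date': '2015001', 'version': 'v1.0'}
#   -> 'SST.L3.Global.Clim.5day.2015001.v1.0.nc'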
# Simple mask and average functions to get us started, then add gaussian interpolation.
# MODIS L3 SST product, qual_sst is [-1, 2] - =0 is best data, can add =1 for better coverage
#def qcMask(var, mask): return N.ma.array(var, mask=N.ma.make_mask(mask == 0))
def qcMask(var, mask): return N.ma.masked_where(mask != 0, var)
#def qcMask(var, mask): return N.ma.masked_where(mask < 0, var)

def splitModisSst(seq, n):
    for chunk in splitByNDays(seq, n, re.compile(r'(...).L3m'), keyed=False):
        yield chunk

def mean(a): return N.ma.mean(a, axis=0)

AveragingFunctions = {'pixelMean': mean, 'gaussInterp': gaussInterp, 'spatialFilter': spatialFilter}

PixelMeanConfig = {'name': 'pixelMean'}

GaussInterpConfig = {'name': 'gaussInterp',
                     'latGrid': None, 'lonGrid': None,     # None means use input lat/lon grid
                     'wlat': 3, 'wlon': 3,
                     'slat': 0.15, 'slon': 0.15, 'stime': 1,
                     'vfactor': -0.6931, 'missingValue': default_fillvals['f4']}

GaussInterpConfig1a = {'name': 'gaussInterp',
                       'latGrid': None, 'lonGrid': None,   # None means use input lat/lon grid
                       'wlat': 0.30, 'wlon': 0.30,
                       'slat': 0.15, 'slon': 0.15, 'stime': 1,
                       'vfactor': -0.6931, 'missingValue': default_fillvals['f4']}

GaussInterpConfig1b = {'name': 'gaussInterp',
                       'latGrid': None, 'lonGrid': None,   # None means use input lat/lon grid
                       'wlat': 0.08, 'wlon': 0.08,
                       'slat': 0.15, 'slon': 0.15, 'stime': 1,
                       'vfactor': -0.6931, 'missingValue': default_fillvals['f4']}

GaussInterpConfig2 = {'name': 'gaussInterp',
                      'latGrid': (89.5, -89.5, -0.25), 'lonGrid': (-180., 179., 0.25),
                      'wlat': 2., 'wlon': 2.,
                      'slat': 0.15, 'slon': 0.15, 'stime': 1,
                      'vfactor': -0.6931, 'missingValue': default_fillvals['f4']}

FilterGaussian = [[1, 2, 1], [2, 4, 2], [1, 2, 1]]    # divide by 16
FilterLowPass = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]     # divide by 9

SpatialFilterConfig1 = {'name': 'spatialFilter', 'normalization': 16.,
                        'spatialFilter': N.array(FilterGaussian, dtype=N.int32),
                        'missingValue': default_fillvals['f4']}

SpatialFilterConfig2 = {'name': 'spatialFilter', 'normalization': 9.,
                        'spatialFilter': N.array(FilterLowPass, dtype=N.int32),
                        'missingValue': default_fillvals['f4']}
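
# Minimal sketch of how the qcMask/mean pair behaves on a tiny synthetic stack of grids.
# This helper is purely illustrative (never called by the pipeline) and its values are invented:
def _exampleMaskAndAverage():
    sst = N.array([[[20.0, 21.0], [22.0, 23.0]],
                   [[20.5, 21.5], [22.5, 23.5]]], dtype=N.float32)    # two 2x2 'daily' SST grids
    qual = N.array([[[0, 1], [0, 0]],
                    [[0, 0], [2, 0]]], dtype=N.int8)                  # qual_sst flags: 0 = best retrievals
    masked = qcMask(sst, qual)    # mask every pixel whose quality flag is non-zero
    return mean(masked)           # per-pixel mean over the time axis, ignoring masked cells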
def climByAveragingPeriods(urls,            # list of (daily) granule URLs for a long time period (e.g. a year)
              nEpochs,                      # compute a climatology for every N epochs (days) by 'averaging'
              nWindow,                      # number of epochs in window needed for averaging
              nNeighbors,                   # number of neighbors on EACH side in lat/lon directions to use in averaging
              variable,                     # name of primary variable in file
              mask,                         # name of mask variable
              coordinates,                  # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon')
              splitFn=splitModisSst,        # split function to use to partition the input URL list
              maskFn=qcMask,                # mask function to compute mask from mask variable
              averager='pixelMean',         # averaging function to use, one of ['pixelMean', 'gaussInterp', 'spatialFilter']
              averagingConfig={},           # dict of parameters to control the averaging function (e.g. gaussInterp)
              optimization='fortran',       # optimization mode (fortran or cython)
              mode='sequential',            # Map across time periods of N-days for concurrent work, executed by:
                                            # 'sequential' map, 'multicore' using pool.map(), 'cluster' using pathos pool.map(),
                                            # or 'spark' using PySpark
              numNodes=1,                   # number of cluster nodes to use
              nWorkers=4,                   # number of parallel workers per node
              averagingFunctions=AveragingFunctions,    # dict of possible averaging functions
              legalModes=ExecutionModes,    # list of possible execution modes
              cachePath=CachePath           # directory to cache retrieved files in
             ):
    '''Compute a climatology every N days by applying a mask and averaging function.
For each N-day period, writes the averaged variable grid, the attributes of the primary variable,
and the coordinate arrays to an output netCDF file.
***Assumption: This routine assumes that the N grids will fit in memory.***
    '''
    if averagingConfig['name'] == 'gaussInterp':
        averagingConfig['wlat'] = nNeighbors
        averagingConfig['wlon'] = nNeighbors
    try:
        averageFn = averagingFunctions[averager]
    except:
        print >>sys.stderr, 'climatology: Error, Averaging function must be one of: %s' % str(averagingFunctions)
        sys.exit(1)

    urlSplits = [s for s in splitFn(urls, nEpochs)]

    def climsContoured(urls, plot=None, fillValue=default_fillvals['f4'], format='NETCDF4', cachePath=cachePath):
        n = len(urls)
        if VERBOSE: print >>sys.stderr, urls

        var = climByAveraging(urls, variable, mask, coordinates, maskFn, averageFn, averagingConfig, optimization, cachePath)

        fn = os.path.split(urls[0])[1]
        inFile = os.path.join(cachePath, fn)
        method = averagingConfig['name']
        fn = os.path.splitext(fn)[0]
        day = fn[5:8]
        nDays = int(var['time'][0])
        if 'wlat' in averagingConfig:
            wlat = averagingConfig['wlat']
        else:
            wlat = 1
        if int(wlat) == wlat:
            outFile = 'A%s.L3m_%dday_clim_sst_4km_%s_%dnbrs.nc' % (day, nDays, method, int(wlat))     # mark each file with first day in period
        else:
            outFile = 'A%s.L3m_%dday_clim_sst_4km_%s_%4.2fnbrs.nc' % (day, nDays, method, wlat)       # mark each file with first day in period
        outFile = writeOutNetcdfVars(var, variable, mask, coordinates, inFile, outFile, fillValue, format)

        if plot == 'contour':
            figFile = contourMap(var, variable, coordinates, n, outFile)
        elif plot == 'histogram':
#            figFile = histogram(var, variable, n, outFile)
            figFile = None
        else:
            figFile = None
        return (outFile, figFile)

    if mode == 'sequential':
        results = map(climsContoured, urlSplits)
    elif mode == 'multicore':
        pool = Pool(nWorkers)
        results = pool.map(climsContoured, urlSplits)
    elif mode == 'cluster':
        pass
    elif mode == 'spark':
        pass

    return results
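
# Note: the 'cluster' and 'spark' execution modes above are stubs (pass).  A minimal,
# untested sketch of the 'spark' branch, assuming the caller had created a pyspark
# SparkContext named sc (not provided by this module), might look like:
#
#     rdd = sc.parallelize(urlSplits, numNodes * nWorkers)
#     results = rdd.map(climsContoured).collect()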
def climByAveraging(urls,                   # list of granule URLs for a time period
                    variable,               # name of primary variable in file
                    mask,                   # name of mask variable
                    coordinates,            # names of coordinate arrays to read and pass on (e.g. 'lat' and 'lon')
                    maskFn=qcMask,          # mask function to compute mask from mask variable
                    averageFn=mean,         # averaging function to use
                    averagingConfig={},     # parameters to control averaging function (e.g. gaussInterp)
                    optimization='fortran', # optimization mode (fortran or cython)
                    cachePath=CachePath
                   ):
    '''Compute a climatology over N arrays by applying a mask and averaging function.
Returns the averaged variable grid, attributes of the primary variable, and the coordinate arrays in a dictionary.
***Assumption: This routine assumes that the N grids will fit in memory.***
    '''
    n = len(urls)
    varList = [variable, mask]
    var = {}
    vtime = N.zeros((n,), N.int32)

    for i, url in enumerate(urls):
        try:
            path = retrieveFile(url, cachePath)
            fn = os.path.split(path)[1]
            vtime[i] = int(fn[5:8])          # KLUDGE: extract DOY from filename
        except:
            print >>sys.stderr, 'climByAveraging: Error, continuing without file %s' % url
            accum[i] = emptyVar              # NOTE: accum & emptyVar only exist once the first granule has been read successfully
            continue
        if path is None: continue

        print >>sys.stderr, 'Read variables and mask ...'
        try:
            var, fh = getVariables(path, varList, arrayOnly=True, order='F', set_auto_mask=False)    # return dict of variable objects by name
        except:
            print >>sys.stderr, 'climByAveraging: Error, cannot read file %s' % path
            accum[i] = emptyVar
            continue

        if i == 0:
            dtype = var[variable].dtype
            if 'int' in dtype.name: dtype = N.float32
            shape = (n,) + var[variable].shape
            accum = N.ma.empty(shape, dtype, order='F')
            emptyVar = N.array(N.ma.masked_all(var[variable].shape, dtype), order='F')    # totally masked variable array for missing or bad file reads

        print >>sys.stderr, 'Read coordinates ...'
        var, fh = getVariables(path, coordinates, var, arrayOnly=True, order='F')    # read coordinate arrays and add to dict

        var[variable] = maskFn(var[variable], var[mask])    # apply quality mask variable to get numpy MA; masking by the netCDF4 library is turned off
#        var[variable] = var[variable][:]

        # Echo variable range for sanity check
        vals = var[variable].compressed()
        print >>sys.stderr, 'Variable Range: min, max:', vals.min(), vals.max()

        # Plot input grid
#        figFile = histogram(vals, variable, n, os.path.split(path)[1])
#        figFile = contourMap(var, variable, coordinates[1:], n, os.path.split(path)[1])

        accum[i] = var[variable]                    # accumulate N arrays for 'averaging'
#        if i != 0 and i+1 != n: close(fh)          # REMEMBER: closing fh loses netCDF4 in-memory data structures
        close(fh)

    coordinates = ['time'] + coordinates            # add constructed time (days) as first coordinate
    var['time'] = vtime

    if averagingConfig['name'] == 'pixelMean':
        print >>sys.stderr, 'Doing Pixel Average over %d grids ...' % n
        start = time.time()
        avg = averageFn(accum)                      # call pixel averaging function
        end = time.time()
        print >>sys.stderr, 'pixelMean execution time:', (end - start)
        outlat = var[coordinates[1]].astype(N.float32)[:]
        outlon = var[coordinates[2]].astype(N.float32)[:]
    elif averagingConfig['name'] == 'gaussInterp':
        print >>sys.stderr, 'Doing Gaussian Interpolation over %d grids ...' % n
        var[variable] = accum
        c = averagingConfig
        latGrid = c['latGrid'];  lonGrid = c['lonGrid']
        if latGrid is not None and lonGrid is not None:
            outlat = N.arange(latGrid[0], latGrid[1]+latGrid[2], latGrid[2], dtype=N.float32, order='F')
            outlon = N.arange(lonGrid[0], lonGrid[1]+lonGrid[2], lonGrid[2], dtype=N.float32, order='F')
        else:
            outlat = N.array(var[coordinates[1]], dtype=N.float32, order='F')
            outlon = N.array(var[coordinates[2]], dtype=N.float32, order='F')
        varNames = [variable] + coordinates
        start = time.time()
        avg, weight, status = \
            gaussInterp(var, varNames, outlat, outlon, c['wlat'], c['wlon'],
                        c['slat'], c['slon'], c['stime'], c['vfactor'], c['missingValue'],
                        VERBOSE, optimization)
        end = time.time()
        var['outweight'] = weight.astype(N.float32)
        print >>sys.stderr, 'gaussInterp execution time:', (end - start)
    elif averagingConfig['name'] == 'spatialFilter':
        print >>sys.stderr, 'Applying Spatial 3x3 Filter and then averaging over %d grids ...' % n
        var[variable] = accum
        c = averagingConfig
        varNames = [variable] + coordinates
        start = time.time()
        avg, count, status = \
            spatialFilter(var, varNames, c['spatialFilter'], c['normalization'],
                          c['missingValue'], VERBOSE, optimization)
        end = time.time()
        print >>sys.stderr, 'spatialFilter execution time:', (end - start)
        outlat = var[coordinates[1]].astype(N.float32)[:]
        outlon = var[coordinates[2]].astype(N.float32)[:]

    var['out'+variable] = avg.astype(N.float32)     # return primary variable & mask arrays in dict
    var['out'+mask] = N.ma.getmask(avg)
    var['outlat'] = outlat
    var['outlon'] = outlon
    return var
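
# Illustrative (hypothetical) use of the dictionary returned by climByAveraging; the
# fiveDayUrls list is made up, and only a few of the returned keys are shown:
#
#     var = climByAveraging(fiveDayUrls, 'sst', 'qual_sst', ['lat', 'lon'],
#                           averagingConfig=PixelMeanConfig)
#     clim = var['outsst']      # averaged grid as a float32 array
#     lats = var['outlat']      # latitude coordinate of the output grid
#     lons = var['outlon']      # longitude coordinate of the output grid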
def writeOutNetcdfVars(var, variable, mask, coordinates, inFile, outFile, fillValue=None, format='NETCDF4'):
    '''Construct output bundle of arrays with NetCDF dimensions and attributes.
Output variables and attributes will have same names as the input file.
    '''
    din = Dataset(inFile, 'r')
    dout = Dataset(outFile, 'w', format=format)
    print >>sys.stderr, 'Writing %s ...' % outFile

    # Transfer global attributes from input file
    for a in din.ncattrs():
        dout.setncattr(a, din.getncattr(a))

    # Add dimensions and variables, copying data
    coordDim = [dout.createDimension(coord, var['out'+coord].shape[0]) for coord in coordinates]    # here output lon, lat, etc.
    for coord in coordinates:
        v = dout.createVariable(coord, var['out'+coord].dtype, (coord,))
        v[:] = var['out'+coord][:]
    primaryVar = dout.createVariable(variable, var['out'+variable].dtype, coordinates, fill_value=fillValue)
    primaryVar[:] = var['out'+variable][:]          # transfer array
    maskVar = dout.createVariable(mask, 'i1', coordinates)
    maskVar[:] = var['out'+mask].astype('i1')[:]

    # Transfer variable attributes from input file
    for k, v in dout.variables.iteritems():
        for a in din.variables[k].ncattrs():
            if a == 'scale_factor' or a == 'add_offset' or a == '_FillValue': continue
            v.setncattr(a, din.variables[k].getncattr(a))
        if k == variable:
            try:
#                if fillValue == None: fillValue = din.variables[k].getncattr('_FillValue')    # total kludge
                if fillValue == None: fillValue = default_fillvals['f4']
#                print >>sys.stderr, default_fillvals
#                v.setncattr('_FillValue', fillValue)    # set proper _FillValue for climatology array
                v.setncattr('missing_value', fillValue)
                print >>sys.stderr, 'Setting missing_value for primary variable %s to %f' % (variable, fillValue)
            except:
                print >>sys.stderr, 'writeOutNetcdfVars: Warning, for variable %s no fill value specified or derivable from inputs.' % variable

    din.close()
    dout.close()
    return outFile
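
# A quick, hypothetical sanity check of a file produced by writeOutNetcdfVars (the output
# name below follows the pattern built in climsContoured, but is only an example):
#
#     d = Dataset('A001.L3m_5day_clim_sst_4km_pixelMean_1nbrs.nc', 'r')
#     print d.variables['sst'].shape, d.variables['sst'].missing_value
#     d.close()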
def contourMap(var, variable, coordinates, n, outFile):
    figFile = os.path.splitext(outFile)[0] + '_hist.png'
    # TODO: Downscale variable array (SST) before contouring, matplotlib is TOO slow on large arrays
    vals = var[variable][:]

    # Fixed color scale, write file, turn off auto borders, set title, reverse lat direction so monotonically increasing??
    imageMap(var[coordinates[1]][:], var[coordinates[0]][:], var[variable][:],
             vmin=-2., vmax=45., outFile=figFile, autoBorders=False,
             title='%s %d-day Mean from %s' % (variable.upper(), n, outFile))
    print >>sys.stderr, 'Writing contour plot to %s' % figFile
    return figFile


def histogram(vals, variable, n, outFile):
    figFile = os.path.splitext(outFile)[0] + '_hist.png'
    M.clf()
#    M.hist(vals, 47, (-2., 45.))
    M.hist(vals, 94)
    M.xlim(-5, 45)
    M.xlabel('SST in degrees Celsius')
    M.ylim(0, 300000)
    M.ylabel('Count')
    M.title('Histogram of %s %d-day Mean from %s' % (variable.upper(), n, outFile))
    M.show()
    print >>sys.stderr, 'Writing histogram plot to %s' % figFile
    M.savefig(figFile)
    return figFile


def dailyFile2date(path, offset=1):
    '''Convert YYYYDOY string in filename to date.'''
    fn = os.path.split(path)[1]
    year = int(fn[offset:offset+4])
    doy = int(fn[offset+5:offset+8])
    return fn[5:15].replace('.', '/')


def formatRegion(r):
    """Format lat/lon region specifier as string suitable for file name."""
    if isinstance(r, str):
        return r
    else:
        strs = [str(i).replace('-', 'm') for i in r]
        return 'region-%s-%sby%s-%s' % tuple(strs)


def formatGrid(r):
    """Format lat/lon grid resolution specifier as string suitable for file name."""
    if isinstance(r, str):
        return r
    else:
        return str(r[0]) + 'by' + str(r[1])


def main(args):
    nEpochs = int(args[0])
    nWindow = int(args[1])
    nNeighbors = int(args[2])
    averager = args[3]
    optimization = args[4]
    mode = args[5]
    nWorkers = int(args[6])
    urlFile = args[7]
    urls = [s.strip() for s in open(urlFile, 'r')]

    if averager == 'gaussInterp':
        results = climByAveragingPeriods(urls, nEpochs, nWindow, nNeighbors, 'sst', 'qual_sst', ['lat', 'lon'],
                                         averager=averager, optimization=optimization,
                                         averagingConfig=GaussInterpConfig, mode=mode, nWorkers=nWorkers)
    elif averager == 'spatialFilter':
        results = climByAveragingPeriods(urls, nEpochs, nWindow, nNeighbors, 'sst', 'qual_sst', ['lat', 'lon'],
                                         averager=averager, optimization=optimization,
                                         averagingConfig=SpatialFilterConfig1, mode=mode, nWorkers=nWorkers)
    elif averager == 'pixelMean':
        results = climByAveragingPeriods(urls, nEpochs, nWindow, nNeighbors, 'sst', 'qual_sst', ['lat', 'lon'],
                                         averager=averager, optimization=optimization,
                                         averagingConfig=PixelMeanConfig, mode=mode, nWorkers=nWorkers)
    else:
        print >>sys.stderr, 'climatology2: Error, averager must be one of', AveragingFunctions.keys()
        sys.exit(1)

    if results[0][1] is not None:
        makeMovie([r[1] for r in results], 'clim.mpg')
    return results


if __name__ == '__main__':
    print main(sys.argv[1:])


# Old Tests:
# python climatology2.py 5 5 0 pixelMean fortran sequential 1 urls_sst_10days.txt
# python climatology2.py 5 5 3 gaussInterp fortran sequential 1 urls_sst_10days.txt
# python climatology2.py 5 5 0 pixelMean fortran sequential 1 urls_sst_40days.txt
# python climatology2.py 5 5 0 pixelMean fortran multicore 8 urls_sst_40days.txt
# python climatology2.py 5 5 3 gaussInterp fortran multicore 8 urls_sst_40days.txt

# Old Production:
# python climatology2.py 5 5 0 pixelMean fortran multicore 16 urls_sst_2015.txt >& log &
# python climatology2.py 10 10 0 pixelMean fortran multicore 16 urls_sst_2015.txt >& log &
# python climatology2.py 5 5 3 gaussInterp fortran multicore 16 urls_sst_2015.txt >& log &

# Tests:
# python climatology2.py 5 5 0 pixelMean fortran sequential 1 urls_sst_daynight_5days_sorted.txt
# python climatology2.py 5 5 0 pixelMean fortran multicore 4 urls_sst_daynight_5days_sorted.txt
# python climatology2.py 5 5 3 gaussInterp fortran sequential 1 urls_sst_daynight_5days_sorted.txt
# python climatology2.py 5 5 3 gaussInterp fortran multicore 4 urls_sst_daynight_5days_sorted.txt
# python climatology2.py 5 7 1 spatialFilter fortran sequential 1 urls_sst_daynight_5days_sorted.txt

# Test number of neighbors needed:
# python climatology2.py 5 7 3 gaussInterp fortran multicore 4 urls_sst_daynight_20days_sorted.txt

# Production:
# python climatology2.py 5 7 0 pixelMean fortran multicore 4 urls_sst_daynight_2003_2015_sorted.txt
# python climatology2.py 5 7 3 gaussInterp fortran sequential 1 urls_sst_daynight_2003_2015_sorted.txt
# python climatology2.py 5 7 3 gaussInterp fortran multicore 4 urls_sst_daynight_2003_2015_sorted.txt
# python climatology2.py 5 7 1 spatialFilter fortran multicore 4 urls_sst_daynight_2003_2015_sorted.txt