python source code of LMAarrayFile

from __future__ import absolute_import
from __future__ import print_function
import os, re #, gzip
import numpy as np
import logging
import subprocess

logger = logging.getLogger('FlashAutorunLogger')

from numpy.lib.recfunctions import append_fields

def cat_LMA(filename):
    """ Returns subprocess pipe with LMA data file on stdout """
    if filename.find('http://') >= 0:
        url_copy = subprocess.Popen(['curl', '-s', filename], stdout=subprocess.PIPE)#, stdin=subprocess.PIPE)
        command = ['gunzip', '-c']
        f = subprocess.Popen(command, stdin=url_copy.stdout, stdout=subprocess.PIPE)
        any_input=None
    else:
        if filename.find('.gz') >= 0:
            command = ['gunzip', '-c', filename]
        else:
            command  = ['cat', filename]
        f = subprocess.Popen(command, stdout=subprocess.PIPE)
        any_input = None
    return f, command, any_input


def dec2bin(dec_number):
    if dec_number == 0: return '0'
    return (dec2bin(dec_number >> 1) + str(dec_number % 2))


def countBits(values):
    # bit shifting routines are in numpy 1.4
    from numpy import array, left_shift, right_shift

    v = array(values).astype('uint32')


    # Bit counting for a 32 bit unsigned integer.
    # there is a fully generic version of this method at
    # http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
    # Binary magic numbers method, also found in  in pages 187-188 of Software Optimization Guide for AMD Athlon 64 and Opteron Processors.

    # The C version is:
    # v = v - ((v >> 1) & 0x55555555);                    # reuse input as temporary
    # v = (v & 0x33333333) + ((v >> 2) & 0x33333333);     # temp
    # c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; # count

    fives  = int('0x55555555', base=16)
    threes = int('0x33333333', base=16)
    effs   = int('0xF0F0F0F',  base=16)
    ones   = int('0x1010101',  base=16)

    v = v - ( (right_shift(v, 1)) & fives);                        # reuse input as temporary
    v = (v & threes) + ( (right_shift(v,2)) & threes);             # temp
    c = right_shift(((v + (right_shift(v,4)) & effs) * ones), 24); # count

    return c

def mask_to_int(mask):
    """ Convert object array of mask strings to integers"""
    mask = np.atleast_1d(mask)
    if len(mask.shape) == 0:
        mask_int = np.asarray([], dtype=int)
    else:
        try:
            # mask is a plain integer
            mask_int = np.fromiter((int(v) for v in mask), int)
        except ValueError:
            # mask is a string representing a base-16 (hex) number
            mask_int = np.fromiter((int(v,16) for v in mask), int)
    return mask_int

class LMAdataFile(object):

    def __init__(self, filename, iterator=False):
        """ iterator=True makes self.data an iterator yielding individual event records"""
        self.filename=filename
        self.iterator=iterator
        self.field_names = None
        self.field_formats = None
        self.field_converters = None
        self.header=[]
        self.startmonth=1
        self.startday=1
        self.startyear=0
        self.mask_length = 0
        self.starthour, self.startminute, self.startsecond = 0, 0, 0

        # Could read the mask length here, but that would mean opening/closing the
        #   file twice. Use this if there are issues
        # if self.filename.find('.gz') >= 0:
        #     with gzip.open(filename) as f:
        #         for line_no, line in enumerate(f):
        #             if line.startswith(b'Data format:'):
        #                 self.mask_length = int(line.decode().split(' ')[-1][:-2])
        #                 # print (file_mask_length)
        #                 break
        # f.close()

        # This dictionary is used to match getattr calls for lat, lon, etc.
        #   with column names as indicated in the LMA data file header
        #   To a user of this object, the dictionary keys become attribute
        #   names that she can access.
        #   Add a new entry to the list if the header column names change.
        self.columnNames = {    'time':     ['time (UT sec of day)', 'time'],
                                'lat':      ['lat'],
                                'lon':      ['lon'],
                                'chi2':     ['reduced chi^2', 'chi2'],
                                'power':    ['P(dBW)'],
                                'mask':     ['mask'],
                                'alt':      ['alt(m)', 'alt'],
                                'stations': ['stations', '# of stations contributed'],
                                'charge':   ['charge'],
                                'range':    ['range (km)'],
                                'flash_id': ['flash_id'],
                                'point_id': ['point_id'],
                            }

        self.columnTypes = {
                                'time':      float,
                                'lat':       float,
                                'lon':       float,
                                'chi2':      float,
                                'power':     float,
                                'mask':      'S{0:d}'.format(2),
                                # 'mask':      'S{0:d}'.format(mask_length),
                                # 'mask':      object,
                                # 'mask':      str,
                                'alt':       float,
                                'stations':  int,
                                'charge':    int  ,
                                'range':     float,
                                'flash_id':  int,
                                'point_id':  int,

                            }

        self.columnConverters = {
                                # 'mask':      mask_to_int,
                            }


        if filename: self.read()


    def __getattr__(self, attrname):

        # See __init__ for column names
        try:
            return self.data[attrname]
        except:
            pass

        # If we got here, stations column wasn't in file.
        #   Try getting it from station mask.
        if attrname=='stations':
            stations = self.hexMaskToStationCount()
            # placing self.data in a list due to this bug
            # http://stackoverflow.com/questions/36440557/typeerror-when-appending-fields-to-a-structured-array-of-size-one
            self.data = append_fields([self.data], ('stations',), (stations,))
            return stations

        return None
        #raise AttributeError, attrname

    def hexMaskToStationCount(self, mask=None):
        if mask is None:
            mask=self.mask
        if mask is None: return None

        mask = mask_to_int(mask)
        try:
            stationCount = countBits(mask)
        except:
            logger.warning("Using slow method of getting station counts")
            stationCount = [ dec2bin(onemask).count('1') for onemask in mask]
            #stationCount = map( (lambda onemask: (dec2bin(int(onemask, 16))).count('1') ), mask)

        self.stations = stationCount
        return stationCount

    def get_file_obj(self, notify=True):
        if not (self.filename.find('.dat') >= 0):
            raise IOError("Can only read .dat files")

        if notify:
            logger.info("Loading LMA data from " + self.filename)

        if self.filename.find('.gz') >= 0:
            command = 'gunzip -c ' + self.filename
            thefile = os.popen(command)
            #gzip readlines is slooow ... at least 5x worse.
            #thefile=gzip.GzipFile(self.filename)
        else:
            thefile=open(self.filename, 'r')

        return thefile

    def read(self):


        # if not (self.filename.find('.dat') >= 0):
        #     raise "Can only read .dat files"
        #
        # print "Loading LMA data from " + self.filename
        #
        # if self.filename.find('.gz') >= 0:
        #     command = 'gzcat ' + self.filename
        #     thefile = os.popen(command)
        #     #gzip readlines is slooow ... at least 5x worse.
        #     #thefile=gzip.GzipFile(self.filename)
        # else:
        #     thefile=file(self.filename, 'r')
        thefile = self.get_file_obj()

        #foundDataLine=False
        isDataLine = r"^.*\*+.*data.*\*+.*" #Search for asterisks, data, and asterisks
        matchDataLine = re.compile(isDataLine, re.IGNORECASE)

        # fileLines = thefile.readlines()
        #First read in the header, then the data
        #dataTemp=[]
        # lineIdx = 0
        for lineIdx, line in enumerate(thefile):
            #if not foundDataLine:
            self.header.append(line)

            #Once the ***data*** line is found,
            #   need to treat all lines as data
            if matchDataLine.search(line):
                #print 'Found data'
                #foundDataLine=True
                break
            #else:
            #   dataTemp.append(line.split())

        #Search the header for info on how the data is written
        isColumnHeaderLine = r"^Data:(.*)"
        matchDataFormatLine = re.compile(isColumnHeaderLine, re.IGNORECASE)

        DataFormatLine = r"^Data format: (.*)"
        matchDataFormatLines = re.compile(DataFormatLine, re.IGNORECASE | re.MULTILINE)

        isDataStartTime = r"^Data start time:(.*)"
        matchDataStartTimeLine = re.compile(isDataStartTime, re.IGNORECASE)

        secAnalyzedLine = r"^Number of seconds analyzed:(.*)"
        matchSecAnalyzedLine = re.compile(secAnalyzedLine, re.IGNORECASE | re.MULTILINE)

        for line in self.header:

            startTimeMatch = matchDataStartTimeLine.search(line)
            if startTimeMatch:
                # Looking to deal with something like: " 06/28/04 23:50:00"
                dateAndTime = startTimeMatch.group(1).split()
                self.startmonth, self.startday, self.startyear = [ int(datePart) for datePart in dateAndTime[0].split('/') ]
                self.starthour, self.startminute, self.startsecond = [ int(timePart) for timePart in dateAndTime[1].split(':') ]
                if self.startyear < 1000 and self.startyear > 70:
                    self.startyear += 1900
                else:
                    self.startyear += 2000
            formatMatch=matchDataFormatLine.search(line)
            if formatMatch:
                columns = formatMatch.group(1).split(',')
                columns = [columnName.strip() for columnName in columns]

            secAnalyzedMatch=matchSecAnalyzedLine.search(line)
            if secAnalyzedMatch:
                self.sec_analyzed = int(secAnalyzedMatch.group(1))

            DataFormatMatchs=matchDataFormatLines.search(line)
            if DataFormatMatchs:
                self.mask_length = int(DataFormatMatchs.group(1).split(' ')[-1][:-1])
                self.columnTypes['mask'] = 'S{0:d}'.format(self.mask_length)


        n_columns = len(columns)
        field_names, types = list(range(n_columns)), list(range(n_columns))
        converters = {}
        for column_idx, column in enumerate(columns):
            for field in self.columnNames.keys():
                if column in self.columnNames[field]:
                    field_names[column_idx] = field
                    typecode = self.columnTypes[field]
                    types[column_idx]  = typecode
                    try:
                        converters[column_idx] = self.columnConverters[field]
                    except:
                        pass

        self.field_names   = field_names
        self.field_formats = types
        self.field_converters = converters


        thefile.close()
        thefile = self.get_file_obj(notify=False)


        if not self.iterator:
            # creates a rec-array
            self.data = np.loadtxt(thefile, dtype={'names':field_names, 'formats':types},
                                            converters=converters, skiprows=len(self.header))
            if ('mask' in self.field_names) & ('station' not in self.field_names):
                # Read the stations property, which triggers checking for the
                # adding the column to the data structure
                stations = self.stations
            # Done with file IO
            thefile.close()
        else:
            self._file_object = thefile
            self.data = self._data_record_iterator(skiprows=len(self.header))

        # print "LMA data loaded."

    def _data_record_iterator(self, skiprows=0):
        if ('mask' in self.field_names) & ('station' not in self.field_names):
            calc_stations = True
        else:
            calc_stations = False
        line_count = 0
        for line in self._file_object:
            line_count+=1
            if line_count <= skiprows:
                logger.debug('skipping ' + line[0:20])
                continue
            items = line.split()
            vals = list(map(apply_format, items, self.field_formats))
            record = dict(list(zip(self.field_names, vals)))
            if calc_stations:
                stations = self.hexMaskToStationCount(mask=[record['mask']])
                record['stations'] = stations[0]
            yield record

def apply_format(item, format):
    if format is object: return item
    if (format == 'S4') or (format =='S6'):
        return str(item)
    return format(item)

def runtest():
    lma = LMAdataFile('tmpout/LYLOUT_090329_200000_3600.dat.flash')
    # lma = LMAdataFile('/data/2004/LMA/040526/LYLOUT_040526_230000_0600.dat.gz')
    #lma = LMAdataFile('/Users/ericbruning/othercode/LYLOUT_040526_220000_0600.dat')

    print(lma.startmonth, lma.startday, lma.startyear)
    print(lma.starthour, lma.startminute, lma.startsecond)
    # keys = lma.data.keys()
    # for key in keys:
    #     print key, len(lma.data[key])
    print(lma.lat)
    print(lma.stations)
    print(lma.flash_id)


if __name__ == '__main__':
    runtest()
    # import hotshot
    # from hotshot import stats
    # prof = hotshot.Profile("lmafile_test_profile_array")
    # prof.runcall(runtest)
    # prof.close()
    # s=stats.load("lmafile_test_profile_array")
    # s.sort_stats("time").print_stats()