python source code of generic

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from ctypes import *
import ctypes.util
import struct
import sys
import platform
import os
import re
import math
import locale
import encodings
import collections

from savReaderWriter import *
from py3k import *

class Generic(object):
    """
    Class for methods and data used in reading as well as writing
    IBM SPSS Statistics data files
    """

    def __init__(self, savFileName, ioUtf8=False, ioLocale=None):
        """Constructor. Note that interface locale and encoding can only
        be set once"""
        locale.setlocale(locale.LC_ALL, "")
        self.savFileName = savFileName
        self.libc = cdll.LoadLibrary(ctypes.util.find_library("c"))
        self.spssio = self.loadLibrary()

        self.wholeCaseIn = self.spssio.spssWholeCaseIn
        self.wholeCaseOut = self.spssio.spssWholeCaseOut

        self.encoding_and_locale_set = False
        if not self.encoding_and_locale_set:
            self.encoding_and_locale_set = True
            self.ioLocale = ioLocale
            self.ioUtf8 = ioUtf8

    def _encodeFileName(self, fn):
        """Helper function to encode unicode file names into bytestring file
        names encoded in the file system's encoding. Needed for C functions
        that have a c_char_p filename argument.
        http://effbot.org/pyref/sys.getfilesystemencoding.htm
        http://docs.python.org/2/howto/unicode.html under 'unicode filenames'"""
        if not isinstance(fn, unicode):
            return fn
        elif sys.platform.startswith("win"):
            return self.wide2utf8(fn)
        else:
            encoding = sys.getfilesystemencoding()
            encoding = "utf-8" if not encoding else encoding  # actually, ascii
        try:
            return fn.encode(encoding)
        except UnicodeEncodeError:
            msg = ("File system encoding %r can not be used to " +
                   "encode file name %r [%s]")
            raise ValueError(msg % (encoding, fn, sys.exc_info()[1]))

    def _loadLibs(self, folder):
        """Helper function that loads I/O libraries in the correct order"""
        # Get a list of all the files in the spssio dir for a given OS
        # Sort the list in the order in which the libs need to be loaded
        # Using regex patterns ought to be more resilient to updates of the
        # I/O modules, compared to hardcoding the names
        debug = False
        path = os.path.join(os.path.dirname(__file__), "spssio", folder)
        libs = sorted(os.listdir(path))

        pats = ['(lib)?icuda?t', '(lib)?icuuc', '(lib)?icui',
                '(lib)?zlib', '(lib)?spssd?io']
        libs = [lib for pat in pats for lib in libs if re.match(pat, lib)]
        isLib = r"""\w+(\.s[ol](?:\.\d+)*| # linux/hp/solaris
                    \.\d+\.a|              # aix
                    \.dll|                 # windows
                    (\.\d+)*\.dylib)$      # mac""" # filter out non-libs
        libs = [lib for lib in libs if re.match(isLib, lib, re.I | re.X)]
        load = WinDLL if sys.platform.lower().startswith("win") else CDLL
        if libs and debug:
            print(os.path.basename(path).upper().center(79, "-"))
            print("\n".join(libs))

        return [load(os.path.join(path, lib)) for lib in libs][-1]

    def loadLibrary(self):
        """This function loads and returns the SPSSIO libraries,
        depending on the platform."""

        arch = platform.architecture()[0]
        is_32bit, is_64bit = arch == "32bit", arch == "64bit"
        pf = sys.platform.lower()

        # windows
        if pf.startswith("win") and is_32bit:
            spssio = self._loadLibs("win32")
        elif pf.startswith("win"):
            spssio = self._loadLibs("win64")

        # linux
        elif pf.startswith("lin") and is_32bit:
            spssio = self._loadLibs("lin32")
        elif pf.startswith("lin") and is_64bit and os.uname()[-1] == "s390x":
            # zLinux64: Thanks Anderson P. from System z Linux LinkedIn Group!
            spssio = self._loadLibs("zlinux")
        elif pf.startswith("lin") and is_64bit:
            spssio = self._loadLibs("lin64")

        # other
        elif pf.startswith("darwin") or pf.startswith("mac"):
            # Mac: Thanks Rich Sadowsky!
            spssio = self._loadLibs("macos")
        elif pf.startswith("aix") and is_64bit:
            spssio = self._loadLibs("aix64")
        elif pf.startswith("hp-ux"):
            spssio = self._loadLibs("hpux_it")
        elif pf.startswith("sunos") and is_64bit:
            spssio = self._loadLibs("sol64")
        else:
            msg = "Your platform (%r) is not supported" % pf
            raise EnvironmentError(msg)

        return spssio

    def errcheck(self, res, func, args):
        """This function checks for errors during the execution of
        function <func>"""
        if not res:
            msg = "Error performing %r operation on file %r."
            raise IOError(msg % (func.__name__, self.savFileName))
        return res

    def wide2utf8(self, fn):
        """Take a unicode file name string and encode it to a multibyte string
        that Windows can use to represent file names (CP65001, UTF-8)
        http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130"""

        from ctypes import wintypes

        _CP_UTF8 = 65001
        _CP_ACP = 0  # ANSI
        _LPBOOL = POINTER(c_long)

        _wideCharToMultiByte = windll.kernel32.WideCharToMultiByte
        _wideCharToMultiByte.restype = c_int
        _wideCharToMultiByte.argtypes = [wintypes.UINT, wintypes.DWORD,
                                         wintypes.LPCWSTR, c_int,
                                         wintypes.LPSTR, c_int,
                                         wintypes.LPCSTR, _LPBOOL]
        codePage = _CP_UTF8
        dwFlags = 0
        lpWideCharStr = fn
        cchWideChar = len(fn)
        lpMultiByteStr = None
        cbMultiByte = 0  # zero requests size
        lpDefaultChar = None
        lpUsedDefaultChar = None

        # get size
        mbcssize = _wideCharToMultiByte(
        codePage, dwFlags, lpWideCharStr, cchWideChar, lpMultiByteStr,
        cbMultiByte, lpDefaultChar, lpUsedDefaultChar)
        if mbcssize <= 0:
            raise WinError(mbcssize)
        lpMultiByteStr = create_string_buffer(mbcssize)

        # convert
        retcode = _wideCharToMultiByte(
        codePage, dwFlags, lpWideCharStr, cchWideChar, lpMultiByteStr,
        mbcssize, lpDefaultChar, lpUsedDefaultChar)
        if retcode <= 0:
            raise WinError(retcode)
        return lpMultiByteStr.value

    def openSavFile(self, savFileName, mode=b"rb", refSavFileName=None):
        """This function opens IBM SPSS Statistics data files in mode <mode>
        and returns a handle that  should be used for subsequent operations on
        the file. If <savFileName> is opened in mode "cp", meta data
        information aka the spss dictionary is copied from <refSavFileName>"""
        savFileName = os.path.abspath(savFileName)  # fdopen wants full name
        try:
            fdopen = self.libc._fdopen  # Windows
        except AttributeError:
            fdopen = self.libc.fdopen   # Linux and others
        fdopen.argtypes, fdopen.restype = [c_int, c_char_p], c_void_p
        fdopen.errcheck = self.errcheck
        mode_ = b"wb" if mode == b"cp" else mode
        with open(savFileName, mode_.decode("utf-8")) as f:
            self.fd = fdopen(f.fileno(), mode_)
        if mode == b"rb":
            spssOpen = self.spssio.spssOpenRead
        elif mode == b"wb":
            spssOpen = self.spssio.spssOpenWrite
        elif mode == b"cp":
            spssOpen = self.spssio.spssOpenWriteCopy
        elif mode == b"ab":
            spssOpen = self.spssio.spssOpenAppend

        savFileName = self._encodeFileName(savFileName)
        refSavFileName = self._encodeFileName(refSavFileName)
        sav = c_char_py3k(savFileName)
        fh = c_int(self.fd)
        if mode == b"cp":
            retcode = spssOpen(sav, c_char_py3k(refSavFileName), pointer(fh))
        else:
            retcode = spssOpen(sav, pointer(fh))

        msg = "Problem opening file %r in mode %r" % (savFileName, mode)
        checkErrsWarns(msg, retcode)
        return fh.value

    def closeSavFile(self, fh, mode=b"rb"):
        """This function closes the sav file associated with <fh> that was open
        in mode <mode>."""
        if mode == b"rb":
            spssClose = self.spssio.spssCloseRead
        elif mode in (b"wb", b"cp"):
            spssClose = self.spssio.spssCloseWrite
        elif mode == b"ab":
            spssClose = self.spssio.spssCloseAppend
        else:
            spssClose, retcode = None, 9999

        if spssClose is not None:
            retcode = spssClose(c_int(fh))
        msg = "Problem closing file in mode %r" % mode
        checkErrsWarns(msg, retcode)
        #self.libc.close(self.fd)  ???

    @property
    def releaseInfo(self):
        """This function reports release- and machine-specific information
        about the open file."""
        relInfo = ["release number", "release subnumber", "fixpack number",
                   "machine code", "floating-point representation code",
                   "compression scheme code", "big/little-endian code",
                   "character representation code"]
        relInfoArr = (c_int * len(relInfo))()
        retcode = self.spssio.spssGetReleaseInfo(c_int(self.fh), relInfoArr)
        checkErrsWarns("Problem getting ReleaseInfo", retcode)
        info = dict([(item, relInfoArr[i]) for i, item in enumerate(relInfo)])
        return info

    @property
    def spssVersion(self):
        """Return the SPSS version that was used to create the opened file
        as a three-tuple indicating major, minor, and fixpack version as
        ints. NB: in the transition from SPSS to IBM, a new four-digit
        versioning nomenclature is used. This function returns the old
        three-digit nomenclature. Therefore, no patch version information
        is available."""
        info = self.releaseInfo
        major = info["release number"]
        minor = info["release subnumber"]
        fixpack = info["fixpack number"]
        ver_info = (major, minor, fixpack)
        return collections.namedtuple("ver", "major minor fixpack")(*ver_info)

    @property
    def fileCompression(self):
        """Get/Set the file compression.
        Returns/Takes a compression switch which may be any of the following:
        'uncompressed', 'standard', or 'zlib'. Zlib comression requires SPSS
        v21 I/O files."""
        compression = {0: b"uncompressed", 1: b"standard", 2: b"zlib"}
        compSwitch = c_int()
        func = self.spssio.spssGetCompression
        retcode = func(c_int(self.fh), byref(compSwitch))
        checkErrsWarns("Problem getting file compression", retcode)
        return compression.get(compSwitch.value)

    @fileCompression.setter
    def fileCompression(self, compSwitch):
        compression = {b"uncompressed": 0, b"standard": 1, b"zlib": 2}
        compSwitch = compression.get(compSwitch)
        func = self.spssio.spssSetCompression
        retcode = func(c_int(self.fh), c_int(compSwitch))
        invalidSwitch = retcodes.get(retcode) == 'SPSS_INVALID_COMPSW'
        if invalidSwitch and self.spssVersion[0] < 21:
            msg = "Writing zcompressed files requires >=v21 SPSS I/O libraries"
            raise ValueError(msg)
        checkErrsWarns("Problem setting file compression", retcode)

    @property
    def systemString(self):
        """This function returns the name of the system under which the file
        was created aa a string."""
        sysName = create_string_buffer(42)
        func = self.spssio.spssGetSystemString
        retcode = func(c_int(self.fh), byref(sysName))
        checkErrsWarns("Problem getting SystemString", retcode)
        return sysName.value

    def getStruct(self, varTypes, varNames, mode=b"rb"):
        """This function returns a compiled struct object. The required
        struct format string for the conversion between C and Python
        is created on the basis of varType and byte order.
        --varTypes: SPSS data files have either 8-byte doubles/floats or n-byte
          chars[]/ strings, where n is always 8 bytes or a multiple thereof.
        --byte order: files are written in the byte order of the host system
          (mode="wb") and read/appended using the byte order information
          contained in the SPSS data file (mode is "ab" or "rb" or "cp")"""
        if mode in (b"ab", b"rb", b"cp"):   # derive endianness from file
            endianness = self.releaseInfo["big/little-endian code"]
            endianness = ">" if endianness > 0 else "<"
        elif mode == b"wb":                 # derive endianness from host
            if sys.byteorder == "little":
                endianness = "<"
            elif sys.byteorder == "big":
                endianness = ">"
            else:
                endianness = "@"
        structFmt = [endianness]
        ceil = math.ceil
        for varName in varNames:
            varType = varTypes[varName]
            if varType == 0:
                structFmt.append("d")
            else:
                fmt = str(int(ceil(int(varType) / 8.0) * 8))
                structFmt.append(fmt + "s")
        return struct.Struct("".join(structFmt))

    def getCaseBuffer(self):
        """This function returns a buffer and a pointer to that buffer. A whole
        case will be read into this buffer."""
        caseSize = c_long()
        retcode = self.spssio.spssGetCaseSize(c_int(self.fh), byref(caseSize))
        caseBuffer = create_string_buffer(caseSize.value)
        checkErrsWarns("Problem getting case buffer", retcode)
        return caseBuffer

    @property
    def sysmis(self):
        """This function returns the IBM SPSS Statistics system-missing
        value ($SYSMIS) for the host system (also called 'NA' in other
        systems)."""
        try:
            sysmis = -1 * sys.float_info[0]  # Python 2.6 and higher.
        except AttributeError:
            self.spssio.spssSysmisVal.restype = c_float
            sysmis = self.spssio.spssSysmisVal()
        return sysmis

    @property
    def missingValuesLowHigh(self):
        """This function returns the 'lowest' and 'highest' values used for
        numeric missing value ranges on the host system. This can be used in
        a similar way as the LO and HI keywords in missing values
        specifications (cf. MISSING VALUES foo (LO THRU 0). It may be called
        at any time."""
        lowest, highest = c_double(), c_double()
        func = self.spssio.spssLowHighVal
        retcode = func(byref(lowest), byref(highest))
        checkErrsWarns("Problem getting min/max missing values", retcode)
        ranges = (lowest.value, highest.value)
        return collections.namedtuple("range", "lo hi")(*ranges)

    @property
    def ioLocale(self):
        """This function gets/sets the I/O Module's locale.
        This corresponds with the SPSS command SET LOCALE. The I/O Module's
        locale is separate from that of the client application. The
        <localeName> parameter and the return value are identical to those
        for the C run-time function setlocale. The exact locale name
        specification depends on the OS of the host sytem, but has the
        following form:
                   <lang>_<territory>.<codeset>[@<modifiers>]
        The 'codeset' and 'modifier' components are optional and in Windows,
        aliases (e.g. 'english') may be used. When the I/O Module is first
        loaded, its locale is set to the system default. See also:
        --https://wiki.archlinux.org/index.php/Locale
        --http://msdn.microsoft.com/en-us/library/39cwe7zf(v=vs.80).aspx"""
        if hasattr(self, "setLocale"):
            return self.setLocale
        else:
            currLocale = ".".join(locale.getlocale())
            print("NOTE. Locale not set; getting current locale: %s" % currLocale)
            return currLocale

    @ioLocale.setter
    def ioLocale(self, localeName=""):
        if not localeName:
            localeName = ".".join(locale.getlocale())
        func = self.spssio.spssSetLocale
        func.restype = c_char_p
        self.setLocale = func(c_int(locale.LC_ALL), c_char_py3k(localeName))
        if self.setLocale is None:
            raise ValueError("Invalid ioLocale: %r" % localeName)
        return self.setLocale

    @property
    def fileCodePage(self):
        """This function provides the Windows code page number of the encoding
        applicable to a file."""
        nCodePage = c_int()
        func = self.spssio.spssGetFileCodePage
        retcode = func(c_int(self.fh), byref(nCodePage))
        checkErrsWarns("Problem getting file codepage", retcode)
        return nCodePage.value

    def isCompatibleEncoding(self):
        """This function determines whether the file and interface encoding
        are compatible."""
        try:
            # Windows, note typo 'Endoding'!
            func = self.spssio.spssIsCompatibleEndoding
        except AttributeError:
            func = self.spssio.spssIsCompatibleEncoding
        func.restype = c_bool
        isCompatible = c_int()
        retcode = func(c_int(self.fh), byref(isCompatible))
        msg = "Error testing encoding compatibility: %r" % isCompatible.value
        checkErrsWarns(msg, retcode)
        if not isCompatible.value and not self.ioUtf8:
            msg = ("NOTE. SPSS Statistics data file %r is written in a " +
                   "character encoding (%s) incompatible with the current " +
                   "ioLocale setting. It may not be readable. Consider " +
                   "changing ioLocale or setting ioUtf8=True.")
            print(msg % (self.savFileName, self.fileEncoding))
        return bool(isCompatible.value)

    @property
    def ioUtf8(self):
        """This function returns/sets the current interface encoding.
        ioUtf8 = False --> CODEPAGE mode,
        ioUtf8 = True --> UTF-8 mode, aka. Unicode mode
        This corresponds with the SPSS command SHOW UNICODE (getter)
        and SET UNICODE=ON/OFF (setter)."""
        if hasattr(self, "ioUtf8_"):
            return self.ioUtf8_
        self.ioUtf8_ = self.spssio.spssGetInterfaceEncoding()
        return bool(self.ioUtf8_)

    @ioUtf8.setter
    def ioUtf8(self, ioUtf8):
        try:
            retcode = self.spssio.spssSetInterfaceEncoding(c_int(int(ioUtf8)))
            if retcode > 0 and not self.encoding_and_locale_set:
                # not self.encoding_and_locale_set --> nested context managers
                raise SPSSIOError("Error setting IO interface", retcode)
        except TypeError:
            msg = "Invalid interface encoding: %r (must be bool)" % ioUtf8
            raise Exception(msg)
        if retcode < 0:
            checkErrsWarns("Problem setting ioUtf8", retcode)

    @property
    def fileEncoding(self):
        """This function obtains the encoding applicable to a file.
        The encoding is returned as an IANA encoding name, such as
        ISO-8859-1, which is then converted to the corresponding Python
        codec name. If the file contains no file encoding, the locale's
        preferred encoding is returned"""
        try:
            pszEncoding = create_string_buffer(20)  # is 20 enough??
            func = self.spssio.spssGetFileEncoding
            retcode = func(c_int(self.fh), byref(pszEncoding))
            checkErrsWarns("Problem getting file encoding", retcode)
            iana_codes = encodings.aliases.aliases
            rawEncoding = pszEncoding.value.lower().decode("utf-8")
            if rawEncoding.replace("-", "") in iana_codes:
                iana_code = rawEncoding.replace("-", "")
            else:
                iana_code = rawEncoding.replace("-", "_")
            fileEncoding = iana_codes[iana_code]
            return fileEncoding
        except KeyError:
            print ("NOTE. IANA coding lookup error. Code %r " % iana_code +
                   "does not map to any Python codec.")
            return locale.getpreferredencoding()

    @property
    def record(self):
        """Get/Set a whole record from/to a pre-allocated buffer"""
        args = c_int(self.fh), byref(self.caseBuffer)
        retcode = self.wholeCaseIn(*args)
        if retcode:
            checkErrsWarns("Problem reading row", retcode)
        record = list(self.unpack_from(self.caseBuffer))
        return record

    @record.setter
    def record(self, record):
        try:
            self.pack_into(self.caseBuffer, 0, *record)
        except struct.error:
            msg = "Use ioUtf8=True to write unicode strings [%s]"
            raise TypeError(msg % sys.exc_info()[1])
        args = c_int(self.fh), c_char_py3k(self.caseBuffer.raw)
        retcode = self.wholeCaseOut(*args)
        if retcode:
            checkErrsWarns("Problem writing row\n" + record, retcode)

    def printPctProgress(self, nominator, denominator):
        """This function prints the % progress when reading and writing
        files"""
        if nominator and nominator % 10**4 == 0:
            pctProgress = (float(nominator) / denominator) * 100
            print("%2.1f%%... " % pctProgress),