python source code of pUtil

import sys, os, signal, time, shutil, cgi
import commands, re
import urllib
import json
import traceback

from xml.dom import minidom
from xml.dom.minidom import Document
from xml.dom.minidom import parse, parseString
from xml.etree import ElementTree

# Initialize the configuration singleton
import environment
env = environment.set_environment()

from processes import killProcesses

# exit code
EC_Failed = 255

try:
    import datetime
except:
    pass

try:
    from PilotErrors import PilotErrors
except:
    pass

# Functions to serialize ARGO messages

def serialize(obj):
    return json.dumps(obj,sort_keys=True,indent=2, separators=(',', ': '))

def deserialize(text):
    return json.loads(text)

def convert_unicode_string(unicode_string):
    if unicode_string is not None:
        return str(unicode_string)
    return None

# all files that need to be copied to the workdir
#fileList = commands.getoutput('ls *.py').split()
def getFileList(path_dir=None):
    try:
        if path_dir is None:
            path_dir = env['pilot_initdir']
        file_list = filter(lambda x: x.endswith('.py'), os.listdir(path_dir))
        file_list.append('PILOTVERSION')
        file_list.append('saga')
        file_list.append('radical')
        file_list.append('HPC')
        file_list.append('movers')
        tolog("Copying: %s" % str(file_list))
        return file_list
    except KeyError:
        return []

# default pilot log files
pilotlogFilename = "pilotlog.txt"
essentialPilotlogFilename = "pilotlog-essential.txt"
pilotstderrFilename = "pilot.stderr"

def setPilotlogFilename(filename):
    """ Set the pilot log file name """

    global pilotlogFilename, essentialPilotlogFilename

    if len(filename) > 0:
        pilotlogFilename = filename

        # Add the essential sub string
        base = pilotlogFilename[:pilotlogFilename.find('.')] # pilotlog.txt -> pilotlog
        essentialPilotlogFilename = pilotlogFilename.replace(base, base+'-essential')

def getPilotlogFilename():
    """ Return the pilot log file name """

    return pilotlogFilename

def setPilotstderrFilename(filename):
    """ Set the pilot stderr file name"""

    global pilotstderrFilename
    if len(filename) > 0:
        pilotstderrFilename = filename

def getPilotstderrFilename():
    """ return the pilot stderr file name"""

    return pilotstderrFilename

def tolog_file(msg):
    """ write date+msg to pilot log only """

    t = time.strftime("%d %b %Y %H:%M:%S", time.gmtime(time.time()))
    appendToLog("%s| %s\n" % (t, msg))

def appendToLog(txt):
    """ append txt to file """

    try:
        f = open(pilotlogFilename, 'a')
        f.write(txt)
        f.close()
    except Exception, e:
        if "No such file" in e:
            pass
        else:
            print "WARNING: Exception caught: %s" % e

def tologNew(msg, tofile=True, label='INFO', essential=False):
    """ Write message to pilot log and to stdout """

    # remove backquotes from the msg since they cause problems with batch submission of pilot
    # (might be present in error messages from the OS)
    msg = msg.replace("`","'")
    msg = msg.replace('"','\\"')

    import logging
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s')
    from Logger import Logger
    if essential:
        log = Logger(filename=essentialPilotlogFilename)
    else:
        log = Logger(filename=pilotlogFilename)

    if tofile:
        if label == 'INFO':
            log.info(msg)
        elif label == 'WARNING':
            log.warning(msg)
        elif label == 'DEBUG':
            log.debug(msg)
        elif label == 'ERROR':
            log.error(msg)
        elif label == 'CRITICAL':
            log.critical(msg)
        else:
            log.warning('Unknown label: %s' % (label))
            log.info(msg)
    else:
        print msg

    # write any serious messages to stderr
    if label == 'ERROR' or label == 'CRITICAL':
        print >> sys.stderr, msg # write any FAILED messages to stderr

def tolog(msg, tofile=True, label='INFO', essential=False):
    """ Write date+msg to pilot log and to stdout """

    try:
        import inspect

        MAXLENGTH = 12
        # getting the name of the module that is invoking tolog() and adjust the length
        try:
            module_name = os.path.basename(inspect.stack()[1][1])
        except Exception, e:
            module_name = "unknown"
            #print "Exception caught by tolog(): ", e,
        module_name_cut = module_name[0:MAXLENGTH].ljust(MAXLENGTH)
        msg = "%i|%s| %s" % (os.getpid(),module_name_cut, msg)

        t = timeStampUTC(format='%Y-%m-%d %H:%M:%S')
        if tofile:
            appendToLog("%s|%s\n" % (t, msg))

        # remove backquotes from the msg since they cause problems with batch submission of pilot
        # (might be present in error messages from the OS)
        msg = msg.replace("`","'")
        msg = msg.replace('"','\\"')
        print "%s| %s" % (t, msg)

        # write any FAILED messages to stderr
        if "!!FAILED!!" in msg:
            try:
                print >> sys.stderr, "%s| %s" % (t, msg)
            except:
                print "Failed to print to sys.stderr: %s" % (t, msg)
    except:
        print "!!WARNING!!4000!! %s" % traceback.format_exc()

def tolog_err(msg):
    """ write error string to log """
    tolog("!!WARNING!!4000!! %s" % str(msg))

def tolog_warn(msg):
    """ write warning string to log """
    tolog("!!WARNING!!4000!! %s" % str(msg))

def makeHTTPUpdate(state, node, port, url='pandaserver.cern.ch', path=None):
    """ make http connection to jobdispatcher """

    if state == 'finished' or state == 'failed' or state == 'holding':
        tolog("Preparing for final Panda server update")
        trial = 1
        max_trials = 10
        delay = 2*60 # seconds
        tolog("Max number of trials: %d, separated by delays of %d seconds" % (max_trials, delay))
    else:
        # standard non final update
        trial = 1
        max_trials = 1
        delay = None

    # make http connection to jobdispatcher
    while trial <= max_trials:
        # draw a random server URL
        _url = '%s:%s/server/panda' % (url, port)
        tolog("HTTP connect using server: %s" % (_url))
        ret = httpConnect(node, _url, path=path)
        if ret[0] and trial == max_trials: # non-zero exit code
            if delay: # final update
                tolog("!!FAILED!!4000!! [Trial %d/%d] Could not update Panda server (putting job in holding state if possible)" %\
                      (trial, max_trials))
                # state change will take place in postJobTask
                # (the internal pilot state will not be holding but lostheartbeat)
            else:
                tolog("!!WARNING!!4000!! [Trial %d/%d] Could not update Panda server, EC = %d" %\
                      (trial, max_trials, ret[0]))
            break
        elif ret[0]: # non-zero exit code
            tolog("!!WARNING!!4000!! [Trial %d/%d] Could not update Panda server, EC = %d" %\
                  (trial, max_trials, ret[0]))
            if delay: # final update
                tolog("Can not miss the final update. Will take a nap for %d seconds and then try again.." % (delay))
                trial += 1
                time.sleep(delay)
            else: # try again later
                tolog("Panda server update postponed..")
                break
        else:
            break
    return ret

def httpConnect(data, url, mode="UPDATE", sendproxy=False, path=None, experiment=""):
    """ function to handle the http connection """

    # check if a job should be downloaded or if it's a server update
    if mode == "GETJOB":
        cmd = 'getJob'
    elif mode == "ISTHEREANALYSISJOB":
        cmd = 'isThereAnalysisJob'
    elif mode == "GETSTATUS":
        cmd = 'getStatus'
    elif mode == "GETEVENTRANGES":
        cmd = 'getEventRanges'
    elif mode == "UPDATEEVENTRANGE":
        cmd = 'updateEventRange'
    elif mode == "UPDATEEVENTRANGES":
        cmd = 'updateEventRanges'
    elif mode == "GETKEYPAIR":
        cmd = 'getKeyPair'
    else:
        cmd = 'updateJob'

    # only send attemptNr with updateJob
    if cmd != 'updateJob' and data.has_key('attemptNr'):
        tolog("Removing attemptNr from node structure since it is not needed for command %s" % (cmd))
        del data['attemptNr']
    else:
        if data.has_key('attemptNr'):
            tolog("Sending attemptNr=%d for cmd=%s" % (data['attemptNr'], cmd))
        else:
            tolog("Will not send attemptNr for cmd=%s" % (cmd))

    # send the data dictionary to the dispatcher using command cmd
    # return format: status, parsed data, response
    return toServer(url, cmd, data, path, experiment)

def returnLogMsg(logf=None, linenum=20):
    ''' return the last N lines of log files into a string'''
    thisLog = ''
    if logf:
        if not os.path.isfile(logf):
            thisLog = "\n- No log file %s found -" % (logf)
        else:
            thisLog = "\n- Log from %s -" % (logf)
            f = open(logf)
            lines = f.readlines()
            f.close()

            if len(lines) <= linenum:
                ln = len(lines)
            else:
                ln = linenum

            for i in range(-ln,0):
                thisLog += lines[i]

    return thisLog

def findGuid(analJob, metadata_filename, filename):
    """ find guid in alternative file or generate it """

    guid = None
    metadata_path = os.path.dirname(metadata_filename)
    if os.path.exists(metadata_filename):
        # now grab the guids from the preprocessed metadata
        _guid = getGuidsFromXML(metadata_path, filename=filename, metadata=metadata_filename)
        if _guid != []:
            if _guid[0] != "":
                tolog("Found guid %s in %s (missing in PFC)" % (_guid[0], metadata_filename))
                guid = _guid[0]
            else:
                guid = None
        else:
            guid = None
    else:
        tolog("Could not locate file: %s" % (metadata_filename))

    if not guid:
        if analJob:
            guid = getGUID()
            tolog("Generated guid: %s" % (guid))
        else:
            tolog("Guid missing for file: %s (b)" % (filename))
            guid = None
    else:
        tolog("Guid identified")

    return guid

def preprocessMetadata(filename):
    """ remove META tags from metadata since they can contain value that minidom can not chandle """

    status = True

    # loop over the file and remove the META tags
    try:
        f = open(filename, "r")
    except Exception, e:
        tolog("!!WARNING!!2999!! Could not open file: %s (%s)" % (filename, e))
        status = False
    else:
        lines = f.readlines()
        f.close()

        # remove the META tags
        new_lines = ""
        for line in lines:
            if not "<META" in line and not "<metadata" in line:
                new_lines += line

        # remove the old file before recreating it
        try:
            os.remove(filename)
        except Exception, e:
            tolog("!!WARNING!!2999!! Could not remove file: %s (%s)" % (filename, e))
            status = False
        else:
            try:
                f = open(filename, "w")
            except Exception, e:
                tolog("!!WARNING!!2999!! Could not recreate file: %s (%s)" % (filename, e))
                status = False
            else:
                f.writelines(new_lines)
                f.close()
                tolog("New temporary metadata file:\n" + new_lines)

    return status

def prepareMetadata(metadata_filename):
    """ prepare the metadata for potential guid rescue """

    metadata_filename_BAK = metadata_filename + ".BAK"
    metadata_filename_ORG = metadata_filename

    if os.path.exists(metadata_filename):
        # first create a copy of the metadata
        try:
            shutil.copy2(metadata_filename, metadata_filename_BAK)
        except Exception, e:
            tolog("!!WARNING!!2999!! Could not copy metadata: %s" % (e))
        else:
            metadata_filename = metadata_filename_BAK
            tolog("Created file: %s" % (metadata_filename))

        # remove junk metadata
        try:
            status = preprocessMetadata(metadata_filename)
        except Exception, e:
            tolog("!!WARNING!!2999!! Could not preprocess metadata: %s" % (e))
            metadata_filename = metadata_filename_ORG
        else:
            if status:
                tolog("Successfully updated %s" % (metadata_filename))
            else:
                tolog("Could not update %s" % (metadata_filename))
                metadata_filename = metadata_filename_ORG
    else:
        tolog("Nothing for prepareMetadata() to do (file %s does not exist)" % (metadata_filename))

    return metadata_filename

def PFCxml(experiment, fname, fnlist=[], fguids=[], fntag=None, alog=None, alogguid=None, fsize=[], checksum=[], analJob=False, jr=False, additionalOutputFile=None, additionalOutputFileGuid=None, logToOS=False, archive=None):
    """ Create a PFC style XML file """

    # fnlist = output file list
    # fguids = output file guid list
    # fntag = pfn/lfn identifier
    # alog = name of log file
    # alogguid = guid of log file
    # fsize = file size list
    # checksum = checksum list
    # analJob = analysis job
    # jr = job recovery mode, guid generation by pilot not allowed

    # fntag = lfn is used for the metadata-<jobId>.xml that is sent to the server
    # fntag = pfn is used for OutPutFileCatalog.xml that is used by the mover for the stage out
    # The SURL will be added to the metadata file for fntag = lfn to allow for server side LFC registration

    status = True
    flist = []
    glist = []
    from SiteMover import SiteMover

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    # for metadata.xml prepare the file for potential guid grabbing
    if "metadata" in fname and None in fguids:
        metadata_filename = prepareMetadata(fname + ".PAYLOAD")
    else:
        metadata_filename = fname

    # add log file
    if alog:
        flist.append(alog)
        if not alogguid:
            if not jr:
                alogguid = getGUID()
                tolog("Generated log guid: %s" % (alogguid))
            else:
                tolog("!!WARNING!!2999!! Log guid generation not allowed in recovery mode")
                alogguid = ''
                status = False
        glist.append(alogguid)

    # add additional output files (only for CERNVM, not NG or any other sites)
    if additionalOutputFile:
        flist.append(additionalOutputFile)
        if not additionalOutputFileGuid:
            additionalOutputFileGuid = getGUID()
        glist.append(additionalOutputFileGuid)

    if fnlist:
        flist = flist + fnlist
        tolog("fnlist = %s" % str(fnlist))
        tolog("fguids = %s" % str(fguids))
        for i in range(0, len(fnlist)):
            # check for guid
            try:
                _dummy = fguids[i]
                del _dummy
            except IndexError, e:
                guid = findGuid(analJob, metadata_filename, fnlist[i])
                if guid and guid != "":
                    tolog("Found guid for file: %s (%s)" % (fnlist[i], guid))
                else:
                    if not jr:
                        guid = getGUID()
                        tolog("Generated guid for file (%d): %s (%s)" % (i, fnlist[i], guid))
                    else:
                        tolog("!!WARNING!!2999!! Guid generation not allowed in recovery mode (file: %s)" % (fnlist[i]))
                        guid = ''
                        status = False
                fguids.insert(i, guid)
            else:
                if not fguids[i]:
                    guid = findGuid(analJob, metadata_filename, fnlist[i])
                    if guid and guid != "":
                        tolog("Found guid for file: %s (%s)" % (fnlist[i], guid))
                    else:
                        if not jr:
                            guid = getGUID()
                            tolog("Generated guid for file (%d): %s (%s)" % (i, fnlist[i], guid))
                        else:
                            tolog("!!WARNING!!2999!! Guid generation not allowed in recovery mode (file: %s)" % (fnlist[i]))
                            guid = ''
                            status = False
                    try:
                        fguids[i] = guid
                    except:
                        fguids.insert(i, guid)

            if fntag == "lfn":
                # check for file size
                try:
                    _dummy = fsize[i]
                    del _dummy
                except IndexError, e:
                    #print "This item doesn't exist"
                    fsize.insert(i, "")

                # check for checksum
                try:
                    _dummy = checksum[i]
                    del _dummy
                except IndexError, e:
                    #print "This item doesn't exist"
                    checksum.insert(i, "")

        glist = glist + fguids

    if fntag == "pfn":
        #create the PoolFileCatalog.xml-like file in the workdir
        fd = open(fname, "w")
        fd.write('<?xml version="1.0" encoding="UTF-8" standalone="no" ?>\n')
        fd.write("<!-- Edited By POOL -->\n")
        fd.write('<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">\n')
        fd.write("<POOLFILECATALOG>\n")
        for i in range(0, len(flist)): # there's only one file in flist if it is for the object store
            fd.write('  <File ID="%s">\n' % (glist[i]))
            fd.write("    <physical>\n")
            fd.write('      <pfn filetype="ROOT_All" name="%s"/>\n' % (flist[i]))
            fd.write("    </physical>\n")
            fd.write("  </File>\n")
        fd.write("</POOLFILECATALOG>\n")
        fd.close()
    elif fntag == "lfn":
        # create the metadata.xml-like file that's needed by dispatcher jobs
        fd=open(fname, "w")
        fd.write('<?xml version="1.0" encoding="UTF-8" standalone="no" ?>\n')
        fd.write("<!-- ATLAS file meta-data catalog -->\n")
        fd.write('<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">\n')
        fd.write("<POOLFILECATALOG>\n")
        for i in range(0, len(flist)):
            fd.write('  <File ID="%s">\n' % (glist[i]))
            fd.write("    <logical>\n")
            fd.write('      <lfn name="%s"/>\n' % (flist[i]))
            fd.write("    </logical>\n")

            # if the log is to be transferred to an OS, add an endpoint tag
            if logToOS and alog == flist[i]:
                fd.write('    <endpoint>%s-ddmendpoint_tobeset</endpoint>\n' % (alog))

            # add SURL metadata (not known yet) for server LFC registration
            # use the GUID as identifier (the string "<GUID>-surltobeset" will later be replaced with the SURL)
            if thisExperiment:
                special_xml = thisExperiment.getMetadataForRegistration(glist[i])
                if special_xml != "":
                    fd.write(special_xml)

            # add log file metadata later (not known yet)
            if flist[i] == alog:
                fd.write('    <metadata att_name="fsize" att_value=""/>\n')
                fd.write('    <metadata att_name="csumtypetobeset" att_value=""/>\n')
            elif (additionalOutputFile and flist[i] == additionalOutputFile):
                if ".xml" in additionalOutputFile:
                    fd.write('    <metadata att_name="fsizeXML" att_value=""/>\n')
                    fd.write('    <metadata att_name="csumtypetobesetXML" att_value=""/>\n')
                else:
                    fd.write('    <metadata att_name="fsizeAdditional" att_value=""/>\n')
                    fd.write('    <metadata att_name="csumtypetobesetAdditional" att_value=""/>\n')
            else:
                if len(fsize) != 0:
                    fd.write('    <metadata att_name="fsize" att_value="%s"/>\n' % (fsize[i]))
                if len(checksum) != 0:
                    fd.write('    <metadata att_name="%s" att_value="%s"/>\n' %\
                             (SiteMover.getChecksumType(checksum[i]), checksum[i]))
            fd.write("  </File>\n")
        fd.write("</POOLFILECATALOG>\n")
        fd.close()
    else:
        tolog("!!WARNING!!1234!! fntag is neither lfn nor pfn, did not manage to create the XML file for output files")

    # dump the file to the log
    dumpFile(fname, topilotlog=True)

    return status

def stageInPyModules(initdir, workdir):
    """ copy pilot python modules into pilot workdir from condor initial dir """

    status = True
    ec = 0
    tolog('initdir is %s '%initdir)
    tolog('workdir is %s '%workdir)

    if workdir and initdir:
        for k in getFileList():
            if os.path.isfile("%s/%s" % (initdir, k)):
                try:
                    shutil.copy2("%s/%s" % (initdir, k), workdir)
                except Exception, e:
                    tolog("!!WARNING!!2999!! stageInPyModules failed to copy file %s/%s to %s: %s" % (initdir, k, workdir, e))
                    status = False
                    break
            elif os.path.isdir("%s/%s" % (initdir, k)):
                try:
                    shutil.copytree("%s/%s" % (initdir, k), "%s/%s" % (workdir,k))
                except Exception, e:
                    tolog("!!WARNING!!2999!! stageInPyModules failed to copy directory %s/%s to %s: %s" % (initdir, k, workdir, e))
                    status = False
                    break
            else:
                tolog("!!WARNING!!2999!! File missing during stage in: %s/%s" % (initdir, k))

    if status:
        tolog("Pilot modules have been copied to %s" % (workdir))
    else:
        # get error handler
        error = PilotErrors()
        ec = error.ERR_GENERALERROR

    return ec

def removePyModules(_dir):
    """ Remove pilot python modules from workdir """

    if _dir:
        for k in getFileList(path_dir=_dir):
            if not "runargs" in k:
                try:
                    os.system("rm -rf %s/%s*" % (_dir, k))
                except:
                    pass

def timeStamp():
    """ return ISO-8601 compliant date/time format """

    tmptz = time.timezone
    if tmptz > 0:
        signstr = '-'
    else:
        signstr = '+'
    tmptz_hours = int(tmptz/3600)

    return str("%s%s%02d%02d" % (time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()), signstr, tmptz_hours, int(tmptz/60-tmptz_hours*60)))

def timeStampUTC(t=None, format="%d %b %H:%M:%S"):
    """ return UTC time stamp """

    if not t:
        t = time.time()
    return time.strftime(format, time.gmtime(t))

def getJobStatus(jobId, pshttpurl, psport, path):
    """
    Return the current status of job <jobId> from the dispatcher
    typical dispatcher response: 'status=finished&StatusCode=0'
    StatusCode  0: succeeded
               10: time-out
               20: general error
               30: failed
    In the case of time-out, the dispatcher will be asked one more time after 10s
    """

    status = 'unknown'
    StatusCode = -1
    nod = {}
    nod['ids'] = jobId
    url = "%s:%s/server/panda/getStatus" % (pshttpurl, repr(psport))
    # ask dispatcher about lost job status
    trial = 1
    max_trials = 2
    while trial <= max_trials:
        try:
            # open connection
            ret = httpConnect(nod, url, path=path, mode="GETSTATUS")
            response = ret[1]
            tolog("response: %s" % str(response))
            if response:
                try:
                    # decode the response
                    # eg. var = ['status=notfound', 'attemptNr=0', 'StatusCode=0']
                    # = response

                    # create a dictionary of the response (protects against future updates)
                    # eg. dic = {'status': 'activated', 'attemptNr': '0', 'StatusCode': '0'}
#                    dic = {}
#                    for i in range(len(var)):
#                        key = var[i].split('=')[0]
#                        dic[key] = var[i].split('=')[1]

                    status = response['status']              # e.g. 'holding'
                    attemptNr = int(response['attemptNr'])   # e.g. '0'
                    StatusCode = int(response['StatusCode']) # e.g. '0'
                except Exception, e:
                    tolog("!!WARNING!!2997!! Exception: Dispatcher did not return allowed values: %s, %s" % (str(ret), e))
                    status = "unknown"
                    attemptNr = -1
                    StatusCode = 20
            else:
                tolog("!!WARNING!!2998!! Dispatcher did not return allowed values: %s" % str(ret))
                status = "unknown"
                attemptNr = -1
                StatusCode = 20
        except Exception, e:
            tolog("Could not interpret job status from dispatcher: %s" % (e))
            status = 'unknown'
            attemptNr = -1
            StatusCode = -1
            break
        else:
            if StatusCode == 0: # success
                break
            elif StatusCode == 10: # time-out
                trial = trial + 1
                time.sleep(10)
                continue
            elif StatusCode == 20: # other error
                if ret[0] == 13056 or ret[0] == '13056':
                    tolog("Wrong certificate used with curl operation? (encountered error 13056)")
                break
            else: # general error
                break

    return status, attemptNr, StatusCode

def getExitCode(path, filename):
    """ Try to read the exit code from the pilot stdout log """

    ec = -1

    # first create a tmp file with only the last few lines of the status file to avoid
    # reading a potentially long status file
    tmp_file_name = "tmp-tail-dump-file"
    try:
        os.system("tail %s/%s >%s/%s" % (path, filename, path, tmp_file_name))
    except Exception, e:
        tolog("Job Recovery could not create tmp file %s: %s" % (tmp_file_name, e))
    else:
        # open the tmp file and look for the pilot exit info
        try:
            tmp_file = open(tmp_file_name, 'r')
        except IOError:
            tolog("Job Recovery could not open tmp file")
        else:
            try:
                all_lines = tmp_file.readlines()
            except Exception, e:
                tolog("Job Recovery could not read tmp file %s: %s" % (tmp_file_name, e))

            tmp_file.close()

            # remove the tmp file
            try:
                os.remove(tmp_file_name)
            except OSError:
                tolog("Job Recovery could not remove tmp file: %s" % tmp_file)

            # now check the pilot exit info, if is has an exit code - remove the directory
            exit_info = re.compile(r"job ended with \(trf,pilot\) exit code of \(\d+,\d+\)")
            exitinfo_has_exit_code = False
            for line in all_lines:
                if re.findall(exit_info, line):
                    exitinfo_has_exit_code = True

            if exitinfo_has_exit_code:
                tolog("Job had an exit code")

                # ...
            else:
                tolog("Job had no exit code")

    return ec

def getRemainingOutputFiles(outFiles):
    """
    Return list of files if there are remaining output files in the lost job data directory
    """
    remaining_files = []

    for file_name in outFiles:
        if os.path.exists(file_name):
            remaining_files.append(file_name)

    return remaining_files

def remove(entries):
    """
    Remove files and directories
    entries should be a list
    """
    status = True

    # protect against wrong usage
    if type(entries) == list:
        if len(entries) > 0:
            for entry in entries:
                try:
                    os.system("rm -rf %s" %entry)
                except OSError:
                    tolog("Could not remove %s" % entry)
                    status = False
    else:
        tolog("Argument has wrong type, expected list: %s" % str(type(entries)))
        status = False
    return status

def getCPUmodel():
    """ Get cpu model and cache size from /proc/cpuinfo """
    # model name      : Intel(R) Xeon(TM) CPU 2.40GHz
    # cache size      : 512 KB
    # gives the return string "Intel(R) Xeon(TM) CPU 2.40GHz 512 KB"

    cpumodel = ''
    cpucache = ''
    modelstring = ''
    try:
        f = open('/proc/cpuinfo', 'r')
    except Exception, e:
        tolog("Could not open /proc/cpuinfo: %s" % e)
    else:
        re_model = re.compile('^model name\s+:\s+(\w.+)')
        re_cache = re.compile('^cache size\s+:\s+(\d+ KB)')

        # loop over all lines in cpuinfo
        for line in f.readlines():
            # try to grab cpumodel from current line
            model = re_model.search(line)
            if model:
                # found cpu model
                cpumodel = model.group(1)

            # try to grab cache size from current line
            cache = re_cache.search(line)
            if cache:
                # found cache size
                cpucache = cache.group(1)

            # stop after 1st pair found - can be multiple cpus
            if cpumodel and cpucache:
                # create return string
                modelstring = cpumodel + " " + cpucache
                break

        f.close()

    # default return string if no info was found
    if not modelstring:
        modelstring = "UNKNOWN"

    return modelstring

def getExeErrors(startdir, fileName):
    """ Extract exeErrorCode and exeErrorDiag from jobInfo.xml """

    exeErrorCode = 0
    exeErrorDiag = ""

    # first check if the xml file exists (e.g. it doesn't exist for a test job)
    findFlag = False
    line = ""
    # try to locate the file
    out = commands.getoutput("find %s -name %s" % (startdir, fileName))
    if out != "":
        for line in out.split('\n'):
            tolog("Found trf error file at: %s" % (line))
            findFlag = True
            break # just in case, but there should only be one trf error file
    if findFlag:
        if os.path.isfile(line):
            # import the xml functions
            from xml.sax import make_parser
            from xml.sax.handler import feature_namespaces
            import JobInfoXML

            # Create a parser
            parser = make_parser()

            # Tell the parser we are not interested in XML namespaces
            parser.setFeature(feature_namespaces, 0)

            # Create the handler
            dh = JobInfoXML.JobInfoXML()

            # Tell the parser to use our handler
            parser.setContentHandler(dh)

            # Parse the input
            parser.parse(line)

            # get the error code and the error message(s)
            exeErrorCode = dh.getCode()
            exeErrorDiag = dh.getMessage()
        else:
            tolog("Could not read trf error file: %s" % (line))
    else:
        tolog("Could not find trf error file %s in search path %s" % (fileName, startdir))

    # only return a maximum of 250 characters in the error message (as defined in PandaDB)
    return exeErrorCode, exeErrorDiag[:250]

def debugInfo(_str, tofile=True):
    """ print the debug string to stdout"""

    tolog("DEBUG: %s" % (_str), tofile=tofile)

def isBuildJob(outFiles):
    """
    Check if the job is a build job
    (i.e. check if the job only has one output file that is a lib file)
    """

    isABuildJob = False
    # outFiles only contains a single file for build jobs, the lib file
    if len(outFiles) == 1:
        # e.g. outFiles[0] = user.paulnilsson.lxplus001.lib._001.log.tgz
        if outFiles[0].find(".lib.") > 0:
            isABuildJob = True

    return isABuildJob

def OSBitsCheck():
    """ determine whether the platform is a 32 or 64-bit OS """

    b = -1
    try:
        a = commands.getoutput('uname -a')
        b = a.find('x86_64')
    except:
        return 32 # default
    else:
        if b == -1 : # 32 bit OS
            return 32
        else: # 64 bits OS
            return 64

def uniqueList(input_list):
    """
    return a list of unique entries
    input_list = ['1', '1', '2'] -> ['1', '2']
    """

    u = {}
    for x in input_list:
        u[x] = 1
    return u.keys()

def diffLists(list1, list2):
    """
    compare the input lists (len(list1) must be > len(list2))
    and return the difference
    """

    d = {}
    for x in list1:
        d[x] = 1
    for x in list2:
        if d.has_key(x):
            del d[x]
    return d.keys()

def getOutputFileInfo(outputFiles, checksum_cmd, skiplog=False, logFile=""):
    """
    Return lists with file sizes and checksums for the given output files
    """

    tolog("getOutputFileInfo")

    ec = 0
    fsize = []
    checksum = []

    # get error handler
    error = PilotErrors()
    pilotErrorDiag = ""

    # add the log file if necessary (when this function is called from RunJob)
    # WARNING: temporary redundancy. fsize and checksum is checked again in mover code, merge later
    if logFile != "":
        outputFiles.insert(0, logFile)

    for filename in outputFiles:
        # add "" for the log metadata since it has not been created yet
        if filename == logFile and skiplog:
            ec = -1
        else:
            from SiteMover import SiteMover
            ec, pilotErrorDiag, _fsize, _checksum = SiteMover.getLocalFileInfo(filename, csumtype=checksum_cmd)
            tolog("Adding %s,%s for file %s using %s" % (_fsize, _checksum, filename, checksum_cmd))
        if ec == 0:
            fsize.append(_fsize)
            checksum.append(_checksum)
        else:
            if ec == error.ERR_FAILEDMD5LOCAL or ec == error.ERR_FAILEDADLOCAL:
                fsize.append(_fsize)
                checksum.append("")
            else:
                fsize.append("")
                checksum.append("")
            if ec != -1: # skip error message for log
                tolog("!!WARNING!!4000!! getFileInfo received an error from getLocalFileInfo for file: %s" % (filename))
                tolog("!!WARNING!!4000!! ec = %d, pilotErrorDiag = %s, fsize = %s, checksum = %s" %\
                      (ec, pilotErrorDiag, str(_fsize), str(_checksum)))
            else:
                tolog("setting ec=0 (%d)" % (ec))
                # do not return -1 as en error message since it only applies to log files
                ec = 0
    if logFile != "":
        outputFiles.remove(logFile)

    #tolog("Going to return %d,%s,%s,%s" % (ec, pilotErrorDiag, fsize, checksum))
    return ec, pilotErrorDiag, fsize, checksum

def updateMetadata(fname, fsize, checksum, format=None, fsizeXML=None, checksumXML=None, fsizeAdditional=None, checksumAdditional=None):
    """
    Add the fsize and checksum values for the log (left empty until this point)
    Return exit code and xml
    If format = NG, then the NorduGrid format of the metadata will be assumed
    fsizeXML and checksumXML are extra attributes needed for CERNVM xml file handling
    """
    ec = 0
    lines = ""

    try:
        f = open(fname, 'r')
    except Exception, e:
        tolog("Failed to open metadata file: %s" % e)
        ec = -1
    else:
        if format == 'NG':
            metadata1 = '<size></size>'
            new_metadata1 = '<size>%s</size>' % (fsize)
        else:
            metadata1 = '<metadata att_name="fsize" att_value=""/>'
            new_metadata1 = '<metadata att_name="fsize" att_value="%s"/>' % (fsize)

        # find out if checksum or adler32 should be added
        from SiteMover import SiteMover
        csumtype = SiteMover.getChecksumType(checksum)

        # special handling for CERNVM metadata
        if checksumXML:
            metadata4 = '<metadata att_name="csumtypetobesetXML" att_value=""/>'
        else:
            metadata4 = 'notused'
        if fsizeXML:
            metadata5 = '<metadata att_name="fsizeXML" att_value=""/>'
            new_metadata5 = '<metadata att_name="fsize" att_value="%s"/>' % (fsizeXML)
        else:
            metadata5 = 'notused'
            new_metadata5 = ''
        if checksumAdditional:
            metadata6 = '<metadata att_name="csumtypetobesetXML" att_value=""/>'
            if csumtype == "adler32":
                new_metadata6 = '<ad32>%s</ad32>' % (checksumAdditional)
            else:
                new_metadata6 = '<md5sum>%s</md5sum>' % (checksumAdditional)
        else:
            metadata6 = 'notused'
            new_metadata6 = ''
        if fsizeAdditional:
            metadata7 = '<metadata att_name="fsizeAdditional" att_value=""/>'
            new_metadata7 = '<metadata att_name="fsize" att_value="%s"/>' % (fsizeAdditional)
        else:
            metadata7 = 'notused'
            new_metadata7 = ''

        # for NG and CERNVM
        if format == 'NG':
            if csumtype == "adler32":
                metadata2 = '<ad32></ad32>'
                new_metadata2 = '<ad32>%s</ad32>' % (checksum)
            else:
                metadata2 = '<md5sum></md5sum>'
                new_metadata2 = '<md5sum>%s</md5sum>' % (checksum)
        else:
            if csumtype == "adler32":
                metadata2 = '<metadata att_name="adler32" att_value=""/>'
                new_metadata2 = '<metadata att_name="adler32" att_value="%s"/>' % (checksum)
            else:
                metadata2 = '<metadata att_name="md5sum" att_value=""/>'
                new_metadata2 = '<metadata att_name="md5sum" att_value="%s"/>' % (checksum)
        metadata3 = '<metadata att_name="csumtypetobeset" att_value=""/>'

        for line in f.readlines():
            newline = ""
            if line.find(metadata1) != -1:
                newline = line.replace(metadata1, new_metadata1)
                lines += newline
            elif line.find(metadata2) != -1:
                newline = line.replace(metadata2, new_metadata2)
                lines += newline
            elif line.find(metadata3) != -1:
                newline = line.replace(metadata3, new_metadata2)
                lines += newline
            elif line.find(metadata4) != -1:
                newline = line.replace(metadata4, new_metadata2)
                lines += newline
            elif line.find(metadata5) != -1:
                newline = line.replace(metadata5, new_metadata5)
                lines += newline
            elif line.find(metadata6) != -1:
                newline = line.replace(metadata6, new_metadata6)
                lines += newline
            elif line.find(metadata7) != -1:
                newline = line.replace(metadata7, new_metadata7)
                lines += newline
            elif line.find('csumtypetobeset') != -1:
                newline = line.replace()
            else:
                lines += line

        f.close()
        try:
            f = open(fname, 'w')
            f.write(lines)
            f.close()
        except Exception, e:
            tolog("Failed to write new metadata for log: %s" % e)
            ec = -1

    return ec, lines

def removeFiles(dir, _fileList):
    """
    Remove files from the work dir
    """

    ec = 0
    found = 0
    for _file in _fileList:
        if os.path.isfile("%s/%s" % (dir, _file)):
            try:
                os.remove("%s/%s" % (dir, _file))
            except Exception, e:
                tolog("Failed to remove file: %s/%s, %s" % (dir, _file, e))
                ec = 1
            else:
                tolog("Removed file: %s/%s" % (dir, _file))
                found += 1
    if found > 0:
        tolog("Removed %d/%d file(s)" % (found, len(_fileList)))

    return ec

def createPoolFileCatalog(file_dictionary, lfns, pfc_name="PoolFileCatalog.xml", forceLogical=False):
    """
    Create the PoolFileCatalog.xml
    file_dictionary = { guid1 : sfn1, ... }
    Adapted from R. Walker's code
    Note: the logical tag can cause problems in some cases (for the payload).
    """

    outxml = ''
    if len(file_dictionary) == 0:
        tolog('No input files so no PFC created')
    else:
        dom = minidom.getDOMImplementation()
        doctype = dom.createDocumentType("POOLFILECATALOG","","InMemory")
        doc = dom.createDocument(None, "POOLFILECATALOG", doctype)
        root = doc.documentElement
        doc.appendChild(root)

        # Prepare plain text as can`t trust minidom on python <2.3
        pfc_text = '<?xml version="1.0" ?>\n'
        pfc_text += '<!-- Edited By the PanDA Pilot -->\n'
        pfc_text += '<!DOCTYPE POOLFILECATALOG  SYSTEM "InMemory">\n'
        pfc_text += '<POOLFILECATALOG>\n'

        # Strip .N because stagein makes soft link, and params have no .N
        for guid in file_dictionary.keys():
            sfn = file_dictionary[guid]
            ftype='ROOT_All'

            _file = doc.createElement("File")
            _file.setAttribute('ID', guid)
            root.appendChild(_file)

            # physical element - file in local directory without .N extension
            physical = doc.createElement("physical")
            _file.appendChild(physical)

            pfn = doc.createElement('pfn')
            pfn.setAttribute('filetype', ftype)
            pfn.setAttribute('name', sfn)
            physical.appendChild(pfn)

            # forceLogical is set for TURL based PFCs. In this case, the LFN must not contain any legacy __DQ2-parts
            if forceLogical:
                logical = doc.createElement('logical')
                logical.setAttribute('name', os.path.basename(sfn))
                _file.appendChild(logical)

                # remove any legacy __DQ2 substring from the LFN if necessary
                _lfn = getLFN(sfn, lfns) #os.path.basename(sfn)
                if "__DQ2" in _lfn:
                    _lfn = stripDQ2FromLFN(_lfn)

                # remove any rucio :-separator if present
                if ":" in _lfn:
                    _lfn = _lfn.split(":")[1]

                pfc_text += '  <File ID="%s">\n    <physical>\n      <pfn filetype="%s" name="%s"/>\n    </physical>\n    <logical>\n      <lfn name="%s"/>\n    </logical>\n  </File>\n' % (guid, ftype, sfn, _lfn)

            else:
                logical = doc.createElement('logical')
                _file.appendChild(logical)
                pfc_text += '  <File ID="%s">\n    <physical>\n      <pfn filetype="%s" name="%s"/>\n    </physical>\n    <logical/>\n  </File>\n' %\
                            (guid, ftype, sfn)

        pfc_text += '</POOLFILECATALOG>\n'
        # tolog(str(doc.toxml()))

        # add escape character for & (needed for google turls)
        if '&' in pfc_text:
            pfc_text = pfc_text.replace('&',  '&#038;')

        tolog(pfc_text)

        try:
            tolog("Writing XML to %s" % (pfc_name))
            f = open(pfc_name, 'w')
            f.write(pfc_text)
        except Exception, e:
            tolog("!!WARNING!!2999!! Could not create XML file: %s" % (e))
        else:
            tolog("Created PFC XML")
            f.close()

        outxml = pfc_text

    return outxml

def replace(filename, stext, rtext):
    """ replace string stext with rtext in file filename """

    status = True
    try:
        _input = open(filename, "r")
    except Exception, e:
        tolog("!!WARNING!!4000!! Open failed with %s" % e)
        status = False
    else:
        try:
            output = open(filename + "_tmp", "w")
        except Exception, e:
            tolog("!!WARNING!!4000!! Open failed with %s" % e)
            status = False
            _input.close()
        else:
            for s in _input.xreadlines():
                output.write(s.replace(stext, rtext))
            _input.close()
            # rename tmp file and overwrite original file
            try:
                os.rename(filename + "_tmp", filename)
            except Exception, e:
                tolog("!!WARNING!!4000!! Rename failed with %s" % e)
                status = False
            output.close()

    return status

def getDirectAccessDic(qdata):
    """ return the directAccess dictionary in case the site supports direct access / file stager """
    # task: create structure
    # directAccess = {
    #                  'oldPrefix' : 'gsiftp://osgserv04.slac.stanford.edu/xrootd/atlas',
    #                  'newPrefix' : 'root://atl-xrdr.slac.stanford.edu:1094//atlas/xrootd',
    #                  'useCopyTool' : False,
    #                  'directIn' : True
    #                  'useFileStager' : True
    #                }
    # from queuedata variable copysetup
    # copysetup = setup_string^oldPrefix^newPrefix^useFileStager^directIn
    # example:
    # copysetup=^gsiftp://osgserv04.slac.stanford.edu/xrootd/atlas^root://atl-xrdr.slac.stanford.edu:1094//atlas/xrootd^False^True
    # (setup_string not used)
    # (all cases tested)
    # qdata = 'whatever^gsiftp://osgserv04.slac.stanford.edu/xrootd/atlas^root://atl-xrdr.slac.stanford.edu:1094//atlas/xrootd^False^True'
    # qdata = '^gsiftp://osgserv04.slac.stanford.edu/xrootd/atlas^root://atl-xrdr.slac.stanford.edu:1094//atlas/xrootd^False^True'
    # qdata = 'gsiftp://osgserv04.slac.stanford.edu/xrootd/atlas^root://atl-xrdr.slac.stanford.edu:1094//atlas/xrootd^False^True'
    # qdata = 'setup^gsiftp://osgserv04.slac.stanford.edu/xrootd/atlas^root://atl-xrdr.slac.stanford.edu:1094//atlas/xrootd^False^False^False'
    # qdata = '' or 'whatever'
    # For TURL PFC creation, the copysetup has the following structure
    # copysetup = setup_string^useFileStager^directIn
    directAccess = None

    if qdata.find('^') > -1:
        n = qdata.count('^')
        i = 0
        # protect against a forgotten inital ^ in case the setup_string is empty!
        if n >= 2 and n <= 5:
            # read data
            data = qdata.split('^')

            # get the setup file (actually not used here)
            # _setup = data[0]

            if n != 2:
                # get file transfer prefices
                i += 1
                oldPrefix = data[i]
                i += 1
                newPrefix = data[i]
            else:
                oldPrefix = ""
                newPrefix = ""

            # get file stager mode
            i += 1
            if data[i].lower() == 'true':
                useFileStager = True
                useCopyTool = False
            else:
                useFileStager = False
                useCopyTool = True

            # get direct access mode
            i += 1
            if data[i].lower() == 'true':
                directIn = True
                useCopyTool = False
            else:
                directIn = False
                if not useFileStager:
                    useCopyTool = True
                else:
                    useCopyTool = False
#                if useFileStager:
#                    tolog("!!WARNING!!2999!! direct access mode reset to True (can not be False in combination with file stager mode)")
#                    directIn = True
#                    useCopyTool = False
#                else:
#                    directIn = False
#                    useCopyTool = True

            # in case copysetup contains a third boolean (old)
            if n == 5:
                tolog("!!WARNING!!2999!! Update schedconfig to use new direct access format: copysetup = setup_string^oldPrefix^newPrefix^useFileStager^directIn")
                if data[n].lower() == 'true':
                    useFileStager = True
                    directIn = True
                    useCopyTool = False
                else:
                    useFileStager = False
                    directIn = False
                    useCopyTool = True

            # create structure
            directAccess = {
                'oldPrefix': oldPrefix,
                'newPrefix': newPrefix,
                'useCopyTool': useCopyTool,
                'directIn': directIn,
                'useFileStager': useFileStager
                }
        else:
            tolog("!!WARNING!!4000!! copysetup has wrong format: %s" % (qdata))
    else:
        # do nothing, don't care about the copysetup right now (only later in Mover)
        pass

    return directAccess

def getErrors(filename):
    """ get all !!TEXT!!NUMBER!!... errors from file """

    ret = ""
    try:
        f = open(filename)
        lines = f.readlines()
        f.close()
    except Exception, e:
        tolog("!!WARNING!!4000!! could not open/read file: %s" % e)
    else:
        p = r"!!(\S+)!!\d+!!"
        pattern = re.compile(p)
        for line in lines:
            if re.findall(pattern, line):
                ret += line

    return ret

def getLFN(pfn, lfns):
    """ Identify the LFN from the list of LFNs corresponding to a PFN """
    # Note: important since the basename of the PFN can contain additional characters,
    # e.g. PFN = /../data15_cos.00259101.physics_IDCosmic.merge.RAW._lb0116._SFO-ALL._0001.1_1427497374
    # but LFN = data15_cos.00259101.physics_IDCosmic.merge.RAW._lb0116._SFO-ALL._0001.1

    lfn = ""
    for _lfn in lfns:
        _basename = os.path.basename(pfn)
        if _lfn in _basename:
#        if _basename.endswith(_lfn):
            # Handle scopes in case they are present
            if ":" in _basename:
                l = _basename.split(":")
                lfn = l[1]
            else:
                lfn = _lfn

    if lfn == "":
        tolog("!!WARNING!!2323!! Correct LFN could not be identified: pfn=%s, lfns=%s (assume basename of PFN)" % (pfn, str(lfns)))
        lfn = os.path.basename(pfn)

    return lfn

def makeTransRegReport(all_transferred, some_transferred, latereg, nr_transferred, nr_files, fail, ec, ret, fields):
    """ make the transfer and registration report """

    error = PilotErrors()

    tolog("")
    tolog("..Transfer and registration report.........................................................................")
    tolog(". Mover has finished")

    if all_transferred and not latereg:
        if nr_files > 1:
            tolog(". All (%d) files have been transferred and registered" % (nr_files))
        else:
            tolog(". The single file has been transferred and registered")
    elif all_transferred and latereg:
        if nr_files > 1:
            tolog(". All (%d) files have been transferred but not registered" % (nr_files))
            tolog(". The files will be registrered by a later pilot if job recovery is supported,")
        else:
            tolog(". The single file has been transferred but not registered")
            tolog(". The file will be registrered by a later pilot if job recovery is supported,")
        tolog(". otherwise this job will fail")
    elif some_transferred and latereg:
        tolog(". Some files (%d/%d) were transferred but no file was registered" % (nr_transferred, nr_files))
        tolog(". The remaining files will be transferred and registrered by a later pilot if job recovery is supported,")
        tolog(". otherwise this job will fail")
    elif some_transferred and not latereg:
        tolog(". Some files (%d/%d) were transferred and registered" % (nr_transferred, nr_files))
        tolog(". The remaining files will be transferred and registrered by a later pilot if job recovery is supported,")
        tolog(". otherwise this job will fail")
    elif not some_transferred:
        tolog(". No files (%d/%d) were transferred or registered" % (nr_transferred, nr_files))
        if nr_files > 1:
            tolog(". The files will be transferred and registrered by a later pilot if job recovery is supported,")
        else:
            tolog(". The file will be transferred and registrered by a later pilot if job recovery is supported,")
            tolog(". otherwise this job will fail")
    else:
        tolog(". Mover has finished")

    if fail != 0:
        tolog(". File transfer exit code                       : (%d, %s)" % (fail, error.getErrorStr(fail)))
    else:
        tolog(". File transfer exit code                       : (%d, <no error>)" % (fail))

    if some_transferred:
        tolog(". File registration return values               : (%d, %s, %s)" %\
              (ec, error.getErrorStr(ec), str(ret)))

    tolog(". Put function will return fields               : %s" % str(fields))
    tolog(". Transfer and registration report produced at  : %s" % timeStamp())
    tolog("...........................................................................................................")
    tolog("")

def hasBeenTransferred(fields):
    """ determine whether files were successfully transferred """

    status = False
    s = 0
    # the fields will all be empty if no files were transferred
    for field in fields:
        s += len(field)
    if s > 0:
        status = True
    return status

def removeSRMInfo(f0):
    """ remove any SRM info from the f0 string """

    from SiteMover import SiteMover
    fields0 = ""
    for pfns in f0.split("+"):
        if pfns != "":
            pfns = SiteMover.stripPortAndVersion(pfns)
            fields0 += "%s+" % (pfns)

    # remove any trailing +-sign
    if fields0[-1] == "+":
        fields0 = fields0[:-1]

    if fields0 == "":
        fields0 = f0
    if f0 != fields0:
        tolog("removeSRMInfo() has updated %s to %s" % (f0, fields0))

    return fields0

def isAnalysisJob(trf):
    """ Determine whether the job is an analysis job or not """

    if (trf.startswith('https://') or trf.startswith('http://')):
        analysisJob = True
    else:
        analysisJob = False

    return analysisJob

def timedCommand(cmd, timeout=300):
    """ Protect cmd with timed_command """

    tolog("Executing command: %s (protected by timed_command, timeout: %d s)" % (cmd, timeout))
    t0 = os.times()
    try:
        from TimerCommand import TimerCommand
        timerCommand = TimerCommand(cmd)
        exitcode, output = timerCommand.run(timeout=timeout)
    except Exception, e:
        pilotErrorDiag = 'TimedCommand() threw an exception: %s' % e
        tolog("!!WARNING!!2220!! %s" % pilotErrorDiag)
        exitcode = 1
        output = str(e)
    else:
        if exitcode != 0:
            tolog("!!WARNING!!2220!! Timed command returned: %s" % (output))

    t1 = os.times()
    telapsed = int(round(t1[4] - t0[4]))
    tolog("Elapsed time: %d" % (telapsed))

    if telapsed >= timeout:
        tolog("!!WARNING!!2220!! Command timed out")
        output += " (timed out)"

    # timed_command adds a trailing \n, remove it
    if output.endswith('\n'):
        output = output[:-1]

    return exitcode, output

def stringToFields(jobFields):
    """ Convert a jobState string to a fields array """

    jobFields = jobFields.replace('[','').replace(']','')
    jobFields = jobFields.replace("\'","")
    rf = jobFields.split(',')
    fields = []
    for f in rf:
        fields += [f.strip()]

    return fields

def readpar(parameter, alt=False, version=0, queuename=None):
    """ Read 'parameter' from queuedata via SiteInformation class """

    from SiteInformation import SiteInformation
    si = SiteInformation()

    return si.readpar(parameter, alt=alt, version=version, queuename=queuename)

def getBatchSystemJobID():
    """ return the batch system job id (will be reported to the server) """

    # BQS (e.g. LYON)
    if os.environ.has_key("QSUB_REQNAME"):
        return "BQS", os.environ["QSUB_REQNAME"]
    # BQS alternative
    if os.environ.has_key("BQSCLUSTER"):
        return "BQS", os.environ["BQSCLUSTER"]
    # Torque
    if os.environ.has_key("PBS_JOBID"):
        return "Torque", os.environ["PBS_JOBID"]
    # LSF
    if os.environ.has_key("LSB_JOBID"):
        return "LSF", os.environ["LSB_JOBID"]
    # Sun's Grid Engine
    if os.environ.has_key("JOB_ID"):
        return "Grid Engine", os.environ["JOB_ID"]
    # Condor (variable sent through job submit file)
    if os.environ.has_key("clusterid"):
        return "Condor", os.environ["clusterid"]
    # Condor (get jobid from classad file)
    if os.environ.has_key("_CONDOR_JOB_AD"):
        return "Condor", commands.getoutput('sed -n "s/GlobalJobId.*\\"\\(.*\\)\\".*/\\1/p" %s' % os.environ["_CONDOR_JOB_AD"])
    # SLURM
    if os.environ.has_key("SLURM_JOB_ID"):
        return "SLURM", os.environ["SLURM_JOB_ID"]

#    # Condor (id unknown)
#    if os.environ.has_key("_CONDOR_SCRATCH_DIR"):
#        return "Condor", "(unknown clusterid)"

    return None, ""

def touch(filename):
    """ touch a file """

    if not os.path.isfile(filename):
        try:
            os.system("touch %s" % (filename))
        except Exception, e:
            tolog("!!WARNING!!1000!! Failed to touch file: %s" % e)
        else:
            tolog("Lock file created: %s" % (filename))

def createLockFile(jobrec, workdir, lockfile="LOCKFILE"):
    """
    Site workdir protection to prevent the work dir from being deleted by the cleanup
    function if pilot fails to register the log
    """

    # only try to create a lock file if it doesn't exist already
    # do not bother to create it if the site doesn't allow for job recovery
    f = "%s/%s" % (workdir, lockfile)
    if lockfile == "LOCKFILE":
        if jobrec:
            touch(f)
    else:
        touch(f)

def checkLockFile(workdir, lockfile):
    """checks if a lockfile exists in path
    workdir/lockfile
    """
    f = '%s/%s' % (workdir, lockfile)
    return os.path.isfile(f)


def verifyTransfer(workdir, verbose=True):
    """ verify that all files were transferred by checking the existance of the ALLFILESTRANSFERRED lockfile """

    status = False
    fname = "%s/ALLFILESTRANSFERRED" % (workdir)
    if os.path.exists(fname):
        if verbose:
            tolog("Verified: %s" % (fname))
        status = True
    else:
        if verbose:
            tolog("Transfer verification failed: %s (file does not exist)" % (fname))

    return status

def removeLEDuplicates(logMsg):
    """ identify duplicated messages in the log extracts and remove them """

    # first create a new log extracts list that does not have the time stamps
    # (which will be different for the otherwise different messages)
    # E.g.:
    # 31 Mar 2008 01:32:37| !!WARNING!!1999!! Could not read modification time of ESD.020072._04023.pool.root.9
    # 31 Mar 2008 02:03:08| !!WARNING!!1999!! Could not read modification time of ESD.020072._04023.pool.root.9
    # should only be printed once,
    # 31 Mar 2008 01:32:37| !!WARNING!!1999!! Could not read modification time of ESD.020072._04023.pool.root.9

    log_extracts_list = logMsg.split('\n')

    # create a temporary list with stripped timestamp fields
    log_extracts_tmp = []
    pattern = re.compile(r"(\d+ [A-Za-z]+ \d+ \d+:\d+:\d+\|)")
    for line in log_extracts_list:
        # id the time stamp
        found = re.findall(pattern, line)
        if len(found) > 0:
            # remove any time stamp
            line = line.replace(found[0], '')
        log_extracts_tmp.append(line)

    # remove duplicate lines and create an index list to know where the original line was
    # (we want to bring back the timestamp)
    # do not use dictionaries since they are not sorted
    i = 0
    log_extracts_index = []
    log_extracts_tmp2 = []
    for line in log_extracts_tmp:
        if line not in log_extracts_tmp2:
            log_extracts_index.append(i)
            log_extracts_tmp2.append(line)
        i += 1

    # create the final list
    log_extracts_tmp = []
    for index in log_extracts_index:
        log_extracts_tmp.append(log_extracts_list[index])

    # return the stripped logMsg
    return "\n".join(log_extracts_tmp)

def writeTimeStampToFile(path="", filename="", overwrite=True):
    """ Write the current time stamp to file """

    if filename == "":
        filename = "START_TIME"
    if path == "":
        path = os.getcwd()

    _filename = os.path.join(path, filename)

    # Are we allowed to overwrite?
    proceed = False
    if overwrite:
        proceed = True
    else:
        # Only proceed if the file does not exist already
        if not os.path.exists(_filename):
            proceed = True

    if proceed:
        writeToFile(_filename, timeStampUTC(format='%Y-%m-%d %H:%M:%S'))

def writeToFile(filename, s):
    """ Write string s to file """

    # Ignore write status
    status = writeToFileWithStatus(filename, s)

def writeToFileWithStatus(filename, s, attribute="w"):
    """ Write string s to file with status return """

    status = False

    try:
        f = open(filename, attribute)
    except Exception, e:
        tolog("!!WARNING!!2990!! Could not open: %s, %s" % (filename, e))
    else:
        f.write("%s" % (s))
        f.close()
        tolog('Wrote string "%s" to file: %s' % (s.replace('\n',''), filename))
        status = True

    return status

def readCodeFromFile(filename):
    """ Wead exit code from file <workdir>/EXITCODE """

    ec = 0
    if os.path.exists(filename):
        try:
            f = open(filename, "r")
        except Exception, e:
            tolog("Failed to open %s: %s" % (filename, e))
        else:
            ec = int(f.read())
            tolog("Found code %d in file %s" % (ec, filename))
            f.close()
    else:
        tolog("No code to report (file %s does not exist)" % (filename))
    return ec

def readStringFromFile(filename):
    """ read exit code from file <workdir>/EXITCODE """

    s = ""
    if os.path.exists(filename):
        try:
            f = open(filename, "r")
        except Exception, e:
            tolog("Failed to open %s: %s" % (filename, e))
        else:
            s = f.read()
            tolog("Found string %s in file %s" % (s, filename))
            f.close()
    else:
        tolog("No string to report (file %s does not exist)" % (filename))
    return s

def verifyQueuedata(queuename, filename, _i, _N, url):
    """ check if the downloaded queuedata has the proper format """

    hasQueuedata = False
    try:
        f = open(filename, "r")
    except Exception, e:
        tolog("!!WARNING!!1999!! Open failed with %s" % e)
    else:
        output = f.read()
        f.close()
        if not ('appdir' in output and 'copytool' in output):
            if len(output) == 0:
                tolog("!!WARNING!!1999!! curl command returned empty queuedata (wrong queuename %s?)" % (queuename))
            else:
                tolog("!!WARNING!!1999!! Attempt %d/%d: curl command did not return valid queuedata from config DB server %s" %\
                      (_i, _N, url))
                output = output.replace('\n', '')
                output = output.replace(' ', '')
                tolog("!!WARNING!!1999!! Output begins with: %s" % (output[:64]))
            try:
                os.remove(filename)
            except Exception, e:
                tolog("!!WARNING!!1999!! Failed to remove file %s: %s" % (filename, e))
        else:
            # found valid queuedata info, break the for-loop
            tolog("schedconfigDB returned: %s" % (output))
            hasQueuedata = True

    return hasQueuedata

def isSameType(trf, userflag):
    """ is the lost job of same type as the current pilot? """

    # treat userflag 'self' as 'user'
    if userflag == 'self':
        userflag = 'user'

    if (isAnalysisJob(trf) and userflag == 'user') or \
           (not isAnalysisJob(trf) and userflag != 'user'):
        sametype = True
        if userflag == 'user':
            tolog("Lost job is of same type as current pilot (analysis pilot, lost analysis job trf: %s)" % (trf))
        else:
            tolog("Lost job is of same type as current pilot (production pilot, lost production job trf: %s)" % (trf))
    else:
        sametype = False
        if userflag == 'user':
            tolog("Lost job is not of same type as current pilot (analysis pilot, lost production job trf: %s)" % (trf))
        else:
            tolog("Lost job is not of same type as current pilot (production pilot, lost analysis job trf: %s)" % (trf))

    return sametype

def getGuidsFromXML(dir, id=None, filename=None, metadata=""):
    """ extract the guid matching the filename from the xml, or all guids if filename not set """

    guids = []

    if metadata != "":
        metadata_filename = metadata
    else:
        # are we in recovery mode? then id is set
        if id:
            metadata_filename = "%s/metadata-%s.xml" % (dir, id)
        else:
            metadata_filename = "%s/metadata.xml" % (dir)

    xmldoc = minidom.parse(metadata_filename)
    _fileList = xmldoc.getElementsByTagName("File")
    for thisfile in _fileList:
        gpfn = str(thisfile.getElementsByTagName("lfn")[0].getAttribute("name"))
        if (filename and gpfn == filename) or (not filename):
            guid = str(thisfile.getAttribute("ID"))
            guids.append(guid)

    return guids

def addToSkipped(lfn, guid):
    """ add metadata for skipped file """

    ec = 0
    try:
        # append to skipped.xml file
        fd = open("skipped.xml", "a")
    except Exception, e:
        tolog("!!WARNING!!2999!! Exception caught: %s" % e)
        ec = -1
    else:
        fd.write('  <File ID="%s">\n' % (guid))
        fd.write("    <logical>\n")
        fd.write('      <lfn name="%s"/>\n' % (lfn))
        fd.write("    </logical>\n")
        fd.write("  </File>\n")
        fd.close()
    return ec

def addSkippedToPFC(fname, skippedfname):
    """ add skipped input file info to metadata.xml """

    ec = 0
    try:
        fd = open(skippedfname, "r")
    except Exception, e:
        tolog("!!WARNING!!2999!! %s" % e)
        ec = -1
    else:
        skippedXML = fd.read()
        fd.close()
        try:
            fdPFC = open(fname, "r")
        except Exception, e:
            tolog("!!WARNING!!2999!! %s" % e)
            ec = -1
        else:
            PFCXML = fdPFC.read()
            fdPFC.close()

    if ec == 0:
        # add the skipped file info to the end of the PFC
        PFCXML = PFCXML.replace("</POOLFILECATALOG>", skippedXML)
        PFCXML += "</POOLFILECATALOG>\n"

        # move the old PFC and create a new PFC
        try:
            os.system("mv %s %s.BAK2" % (fname, fname))
        except Exception, e:
            tolog("!!WARNING!!2999!! %s" % e)
            ec = -1
        else:
            try:
                fdNEW = open(fname, "w")
            except Exception, e:
                tolog("!!WARNING!!2999!! %s" % e)
                ec = -1
            else:
                fdNEW.write(PFCXML)
                fdNEW.close()
                tolog("Wrote updated XML with skipped file info:\n%s" % (PFCXML))
    return ec

def verifyReleaseString(release):
    """ Verify that the release (or homepackage) string is set """

    if release == None:
        release = ""
    release = release.upper()
    if release == "":
        release = "NULL"
    if release == "NULL":
        tolog("Detected unset (NULL) release/homepackage string")
    return release

class _Curl:
    """ curl class """

    # constructor
    def __init__(self):
        # path to curl
        self.path = 'curl'
        # verification of the host certificate
        self._verifyHost = True
        # modified for Titan test
        if ('HPC_Titan' in readpar("catchall")) or ('ORNL_Titan_install' in readpar("nickname")):
            self._verifyHost = False

        # request a compressed response
        self.compress = True
        # SSL cert/key
        from SiteInformation import SiteInformation
        si = SiteInformation()
        self.sslCert = si.getSSLCertificate()
        self.sslKey = self.sslCert
        # CA cert dir
        self.sslCertDir = si.getSSLCertificatesDirectory()

    # GET method
    def get(self, url, data, path):
        # make command
        com = '%s --silent --get' % self.path
        if "HPC_HPC" in readpar('catchall'):
            com += ' --tlsv1'
        com += ' --connect-timeout 100 --max-time 120'
        if not self._verifyHost:
            com += ' --insecure'
        if self.compress:
            com += ' --compressed'
        if self.sslCertDir != '':
            com += ' --capath %s' % self.sslCertDir
        if self.sslCert != '':
            com += ' --cert %s --cacert %s' % (self.sslCert, self.sslCert)
        if self.sslKey != '':
            com += ' --key %s' % self.sslKey
        #com += ' --verbose'
        # data
        if 'nJobs' in data:
            com += ' --header "Accept: application/json"'
        strData = ''
        for key in data.keys():
            strData += 'data="%s"\n' % urllib.urlencode({key:data[key]})
        jobId = ''
        if 'jobId' in data.keys():
            jobId = '_%s' % data['jobId']
        # write data to temporary config file
        # tmpName = commands.getoutput('uuidgen 2> /dev/null')
        tmpName = '%s/curl_%s%s.config' % (path, os.path.basename(url), jobId)
        try:
            tmpFile = open(tmpName, 'w')
            tmpFile.write(strData)
            tmpFile.close()
        except IOError, e:
            tolog("!!WARNING!!2999!! %s" % e)
        if os.path.exists(tmpName):
            com += ' --config %s' % tmpName
        else:
            tolog("!!WARNING!!2999!! Can not set --config option since %s could not be created, curl will fail" % tmpName)
        com += ' %s' % url
        # execute
        tolog("Executing command: %s" % (com))
        try:
            ret = commands.getstatusoutput(com)
        except Exception, e:
            tolog("!!WARNING!!1111!! Caught exception from curl command: %s" % (e))
            ret = [-1, e]
        # remove temporary file
        #os.remove(tmpName)
        return ret

    # POST method
    def post(self, url, data, path):
        # make command
        com = '%s --silent --show-error' % self.path
        if "HPC_HPC" in readpar('catchall'):
            com += ' --tlsv1'
        com += ' --connect-timeout 100 --max-time 120'
        if not self._verifyHost:
            com += ' --insecure'
        if self.compress:
            com += ' --compressed'
        if self.sslCertDir != '':
            com += ' --capath %s' % self.sslCertDir
        if self.sslCert != '':
            com += ' --cert %s --cacert %s' % (self.sslCert, self.sslCert)
        if self.sslKey != '':
            com += ' --key %s' % self.sslKey
        #com += ' --verbose'
        # data
        if 'nJobs' in data:
            com += ' --header "Accept: application/json"'
        strData = ''
        for key in data.keys():
            strData += 'data="%s"\n' % urllib.urlencode({key:data[key]})

        curl_config = '%s/curl.config' % path
        try:
            jobId = ''
            if 'jobId' in data.keys():
                jobId = '_%s' % data['jobId']
            elif 'eventRanges' in data.keys():
                eventRanges = json.loads(data['eventRanges'])
                if 'eventRanges' in eventRanges[0]:
                    jobId = '_%s' % eventRanges[0]['eventRanges'][0]['eventRangeID'].split('-')[1]
                else:
                    jobId = '_%s' % eventRanges[0]['eventRangeID'].split('-')[1]
            curl_config = '%s/curl_%s%s.config' % (path, os.path.basename(url), jobId)
        except:
            tolog("%s" % traceback.format_exc())

        # write data to temporary config file
        tmpName = curl_config
        try:
            tmpFile = open(tmpName,'w')
            tmpFile.write(strData)
            tmpFile.close()
        except IOError, e:
            tolog("!!WARNING!!2999!! %s" % e)
        if os.path.exists(tmpName):
            com += ' --config %s' % tmpName
        else:
            tolog("!!WARNING!!2999!! Can not set --config option since curl.config could not be created, curl will fail")
        com += ' %s' % url
        # execute
        tolog("Executing command: %s" % (com))
        try:
            ret = commands.getstatusoutput(com)
        except Exception, e:
            tolog("!!WARNING!!1111!! Caught exception from curl command: %s" % (e))
            ret = [-1, e]
        # remove temporary file
        #os.remove(tmpName)
        return ret

    # PUT method
    def put(self, url, data):
        # make command
        com = '%s --silent' % self.path
        if "HPC_HPC" in readpar('catchall'):
            com += ' --tlsv1'
        if not self._verifyHost:
            com += ' --insecure'
        if self.compress:
            com += ' --compressed'
        if self.sslCertDir != '':
            com += ' --capath %s' % self.sslCertDir
        if self.sslCert != '':
            com += ' --cert %s --cacert %s' % (self.sslCert, self.sslCert)
        if self.sslKey != '':
            com += ' --key %s' % self.sslKey
        #com += ' --verbose'
        # emulate PUT
        for key in data.keys():
            com += ' -F "%s=@%s"' % (key,data[key])
        com += ' %s' % url
        # execute
        tolog("Executing command: %s" % (com))
        try:
            ret = commands.getstatusoutput(com)
        except Exception, e:
            tolog("!!WARNING!!1111!! Caught exception from curl command: %s" % (e))
            ret = [-1, e]
        return ret

    def verifyHost(self, verify):
        # set _verifyHost
        self._verifyHost = verify

# send message to pandaLogger
def toPandaLogger(data):
    try:
        tpre = datetime.datetime.utcnow()
    except:
        pass
    tolog("toPandaLogger: len(data) = %d" % len(data))
    tolog("data = %s" % str(data))
    response = None

    # instantiate curl
    curl = _Curl()
    url = 'http://pandamon.cern.ch/system/loghandler'

    curlstat, response = curl.get(url, data, os.getcwd())

    try:
        tpost = datetime.datetime.utcnow()
        tolog("Elapsed seconds: %d" % ((tpost-tpre).seconds))
    except:
        pass
    try:
        if curlstat == 0:
            # parse response message
            outtxt = response.lower()
            if outtxt.find('<html>') > 0:
                if outtxt.find('read timeout') > 0:
                    tolog("!!WARNING!!2999!! Timeout on dispatcher exchange")
                else:
                    tolog("!!WARNING!!2999!! HTTP error on dispatcher exchange")
                tolog("HTTP output: %s" % (response))
                return EC_Failed, None, None

            # create the parameter list from the dispatcher response
            data, response = parseDispatcherResponse(response)

            #status = int(data['StatusCode'])
            #if status != 0:
            #    # pilotErrorDiag = getDispatcherErrorDiag(status)
            #    tolog("Dumping %s/curl.config file:" % (path))
            #    dumpFile('%s/curl.config' % (path), topilotlog=True)
        else:
            tolog("!!WARNING!!2999!! Dispatcher message curl error: %d " % (curlstat))
            tolog("Response = %s" % (response))
            tolog("Dumping curl.config file:")
            dumpFile('%s/curl.config' % (path), topilotlog=True)
            return curlstat, None, None

        return 0, data, response

    except:
        _type, value, _traceback = sys.exc_info()
        tolog("ERROR : %s %s" % ( _type, traceback.format_exc()))
        return EC_Failed, None, None

def verifyJobState(state):
    """ Make sure the state is an allowed value """

    allowed_values = ['running', 'failed', 'finished', 'holding', 'starting', 'transferring']

    if state in allowed_values:
        tolog("Job state \'%s\' is an allowed job state value" % (state))
    else:
        tolog("!!WARNING!!3333!! Job state \'%s\' is not an allowed server job state value, job can fail" % (state))
        if state == 'setup' or state == 'stagein' or state == 'stageout':
            state = 'running'
            tolog("Switched to running state for server update")
        else:
            state = 'failed'

    return state

# send message to dispatcher
def toServer(baseURL, cmd, data, path, experiment):
    """ sends 'data' using command 'cmd' to the dispatcher """

    try:
        tpre = datetime.datetime.utcnow()
    except:
        pass
    tolog("toServer: cmd = %s" % (cmd))
    tolog("toServer: len(data) = %d" % len(data))
    tolog("data = %s" % str(data))

    # make sure the job state is an allowed value
    if data.has_key('state'):
        data['state'] = verifyJobState(data['state'])

    # instantiate curl
    curl = _Curl()

    # use insecure for dev server
    #if 'voatlas220' in baseURL:
    #    curl.verifyHost(False)

    # execute
    if cmd == "getStatus":
        url = baseURL
    else:
        url = baseURL + '/' + cmd
    curlstat, response = curl.post(url, data, path)
    try:
        tpost = datetime.datetime.utcnow()
        tolog("Elapsed seconds: %d" % ((tpost-tpre).seconds))
    except:
        pass

    curl_config = '%s/curl.config' % path
    try:
        jobId = ''
        if 'jobId' in data.keys():
            jobId = '_%s' % data['jobId']
        elif 'eventRanges' in data.keys():
            jobId = '_%s' % data['eventRanges'][0]['eventRangeID'].split('-')[1]
        curl_config = '%s/curl_%s%s.config' % (path, os.path.basename(url), jobId)
    except:
        pass

    try:
        if curlstat == 0:
            # parse response message
            outtxt = response.lower()
            if outtxt.find('<html>') > 0:
                if outtxt.find('read timeout') > 0:
                    tolog("!!WARNING!!2999!! Timeout on dispatcher exchange")
                else:
                    tolog("!!WARNING!!2999!! HTTP error on dispatcher exchange")
                tolog("HTTP output: %s" % (response))
                return EC_Failed, None, None

            # create the parameter list from the dispatcher response
            data, response = parseDispatcherResponse(response)
            # update the dispatcher data for Event Service merge jobs
            if experiment != "": # experiment is only set for GETJOB, skip this otherwise
                data = updateDispatcherData4ES(data, experiment, path)

            if 'StatusCode' not in data:
                status = EC_Failed
                tolog("!!WARNING!!2999!! Dispatcher response: %s" % data)
            else:
                status = int(data['StatusCode'])
            if status != 0:
                # pilotErrorDiag = getDispatcherErrorDiag(status)
                tolog("Dumping curl config file: %s" % (curl_config))
                dumpFile(curl_config, topilotlog=True)
        else:
            tolog("!!WARNING!!2999!! Dispatcher message curl error: %d " % (curlstat))
            tolog("Response = %s" % (response))
            tolog("Dumping curl.config file: %s" % curl_config)
            dumpFile(curl_config, topilotlog=True)
            return curlstat, None, None
        if status == 0:
            return status, data, response
        else:
            return status, None, None
    except:
        _type, value, _traceback = sys.exc_info()
        tolog("ERROR %s : %s %s" % (cmd, _type, traceback.format_exc()))
        return EC_Failed, None, None

def getPilotToken(tofile=False):
    """ read the pilot token from file """

    pilottoken = None
    filename = "pilottoken.txt"
    if os.path.exists(filename):
        try:
            f = open(filename, "r")
        except Exception, e:
            tolog("!!WARNING!!2999!! Could not open pilot token file: %s" % e, tofile=tofile)
        else:
            try:
                pilottoken = f.read()
            except Exception, e:
                tolog("!!WARNING!!2999!! Could not read pilot token: %s" % e, tofile=tofile)
            else:
                f.close()
                tolog("Successfully read pilot token", tofile=tofile)
                try:
                    os.remove(filename)
                except Exception, e:
                    tolog("!!WARNING!!2999!! Could not remove pilot token file: %s" % e, tofile=tofile)
                else:
                    tolog("Pilot token file has been removed", tofile=tofile)

    return pilottoken

def removeSubFromResponse(response):
    """ Remove any _subNNN strings from the dataset variables (realDatasets and destinationDblock) """

    tolog("response='%s'"%(response))
    pattern = re.compile('\S+(\_sub[0-9]+)')
    match = pattern.match(response)
    if match:
        # strip away the _subNNN string
        try:
            response = response.replace(match.group(1), '')
        except Exception, e:
            tolog("!!WARNING!!1119!! Failed to remove _sub string (%s) from dispatcher response: %s" % (match.group(1), e))
        else:
            tolog("Updated dispatcher response (removed %s): %s" % (match.group(1), response))
    else:
        tolog("Found no _subNNN string in the dispatcher response")

    return response

def createESFileDictionary(writeToFile):
    """ Create the event range file dictionary from the writeToFile info """

    # writeToFile = 'fileNameForTrf_1:LFN_1,LFN_2^fileNameForTrf_2:LFN_3,LFN_4'
    # -> esFileDictionary = {'fileNameForTrf_1': 'LFN_1,LFN_2', 'fileNameForTrf_2': 'LFN_3,LFN_4'}
    # Also, keep track of the dictionary keys (e.g. 'fileNameForTrf_1') ordered since we have to use them to update the jobParameters
    # once we know the full path to them (i.e. '@fileNameForTrf_1:..' will be replaced by '@/path/filename:..')
    # (the dictionary is otherwise not ordered so we cannot simply use the dictionary keys later)
    # fileInfo = ['fileNameForTrf_1:LFN_1,LFN_2', 'fileNameForTrf_2:LFN_3,LFN_4']

    fileInfo = writeToFile.split("^")
    esFileDictionary = {}
    orderedFnameList = []
    for i in range(len(fileInfo)):
        # Extract the file name
        if ":" in fileInfo[i]:
            finfo = fileInfo[i].split(":")

            # add cwd before the lfns
            #finfo[1] = "`pwd`/" + finfo[1]
            #finfo[1] = finfo[1].replace(',',',`pwd`/')

            # fix the issue that some athena 20 releases have _000 at the end of the filename
            if finfo[0].endswith("_000"):
                tolog("replace %s with %s" % (finfo[0], finfo[0][:-4]))
                finfo[0] = finfo[0][:-4]
            esFileDictionary[finfo[0]] = finfo[1]
            orderedFnameList.append(finfo[0])
        else:
            tolog("!!WARNING!!4444!! File info does not have the correct format, expected a separator \':\': %s" % (fileInfo[i]))
            esFileDictionary = {}
            break

    return esFileDictionary, orderedFnameList

def writeToInputFile(path, esFileDictionary, orderedFnameList, eventservice=True):
    """ Write the input file lists to the proper input file """
    # And populate the fname file dictionary
    # fnames = { 'identifier': '/path/filename', .. }
    # where 'identifier' will be present in the jobParameters, like @identifier. This will later be replaced by the proper file path

    # To be used by the TRF instead of a potentially very long LFN list
    # esFileDictionary = {'fileNameForTrf_1': 'LFN_1,LFN_2', 'fileNameForTrf_2': 'LFN_3,LFN_4'}
    # 'LFN_1,LFN_2' will be written to file 'fileNameForTrf_1', etc

    ec = 0
    fnames = {}
    i = 0
    for fname in esFileDictionary.keys():
        _path = os.path.join(path, fname.replace('.pool.root.', '.txt.'))  # not necessary?
        try:
            f = open(_path, "w")
        except IOError, e:
            tolog("!!WARNING!!4445!! Failed to open file %s: %s" % (_path, e))
            ec = -1
        else:
            if eventservice:
                f.write("--inputHitsFile\n")
            else: # For Reco
                f.write("--inputAODFile\n")
            for inputFile in esFileDictionary[fname].split(","):
                pfn = os.path.join(path, inputFile)
                f.write(pfn + "\n")
            #f.write(esFileDictionary[fname].replace(",","\n"))
            f.close()
            tolog("Wrote input file list to file %s: %s" % (_path, str(esFileDictionary[fname])))
            fnames[fname] = _path
        i += 1

    return ec, fnames

def getWriteToInputFilenames(writeToFile):
    """
    Extract the writeToFile file name(s).
    writeToFile='tmpin_mc16_13TeV.345935.PhPy8EG_A14_ttbarMET100_200_hdamp258p75_nonallhad.merge.AOD.e6620_e5984_s3126_r10724_r10726_tid15760866_00:AOD.15760866._000002.pool.root.1'
    -> return 'tmpin_mc16_13TeV.345935.PhPy8EG_A14_ttbarMET100_200_hdamp258p75_nonallhad.merge.AOD.e6620_e5984_s3126_r10724_r10726_tid15760866_00'

    :param writeToFile:
    :return:
    """

    filenames = []
    entries = writeToFile.split('^')
    for entry in entries:
        if ':' in entry:
            name = entry.split(":")[0]
            _name = name.replace('.pool.root.', '.txt.')  # not necessary?
            filenames.append(_name)

    return filenames

def updateESGUIDs(guids):
    """ Update the NULL valued ES guids """
    # necessary since guids are used as dictionary keys in some places

    # replace the NULL values with different values
    # guids = 'NULL,NULL,NULL,sasdasdasdasdd'
    # -> 'DUMMYGUID0,DUMMYGUID1,DUMMYGUID2,sasdasdasdasdd'

    for i in range(guids.count('NULL')):
        guids = guids.replace('NULL', 'DUMMYGUID%d' % (i), 1)

    return guids

def getESInputFiles(esFileDictionary):
    """ Get all the input files from all the keys in the event range file dictionary """
    # Concaternated into one file list, e.g.
    # esFileDictionary = {'fileNameForTrf_1': 'LFN_1,LFN_2', 'fileNameForTrf_2': 'LFN_3,LFN_4'}
    # -> 'LFN_1,LFN_2','LFN_3,LFN_4'

    files = ""
    for fname in esFileDictionary.keys():
        if files != "":
            files += ","
        files += esFileDictionary[fname]

    return files

def updateJobPars(jobPars, fnames):
    """ Replace the @identifiers with the full paths """

    for identifier in fnames.keys():
        jobPars = jobPars.replace("@%s" % (identifier), "@%s" % (fnames[identifier]))
        tolog("%s: %s" % (identifier, fnames[identifier]))
    s = ["--inputHitsFile=", "--inputAODFile="]
    for t in s:
        if t in jobPars:
            jobPars = jobPars.replace(t, "")
            break

    return jobPars

def updateDispatcherData4ES(data, experiment, path):
    """ Update the input file list for Event Service merge jobs """

    # For Event Service merge jobs, the input file list will not arrive in the inFiles
    # list as usual, but in the writeToFile field, so inFiles need to be corrected
    # path = pilot_init dir, so we know where the file list files are written
    # data = data={'jobsetID': '2235472772', .. }

    # The input file list should be updated
    if data.has_key('writeToFile'):
        writeToFile = data['writeToFile']
        esFileDictionary, orderedFnameList = createESFileDictionary(writeToFile)
        tolog("esFileDictionary=%s" % (esFileDictionary))
        tolog("orderedFnameList=%s" % (orderedFnameList))
        if esFileDictionary != {}:
            """
            # Replace the @inputFor* directorive with the file list
            for name in orderedFnameList:
                tolog("Replacing @%s with %s" % (name, esFileDictionary[name]))
                data['jobPars'] = data['jobPars'].replace("@%s" % (name), esFileDictionary[name])
            """

            # fix the issue that some athena 20 releases have _000 at the end of the filename
            for name in orderedFnameList:
                name_000 = "@%s_000 " % (name)
                new_name = "@%s " % (name)
                if name_000 in data['jobPars']:
                    tolog("%s in jobPars, replace it with %s" % (name_000, new_name))
                    data['jobPars'] = data['jobPars'].replace(name_000, new_name)

            if data.has_key('eventServiceMerge') and data['eventServiceMerge'].lower() == "true":
                eventservice = True
            else:
                eventservice = False
            if not eventservice:  # note: for ES there is a different workflow; see Job.py (writeToFile)
                ec, fnames = writeToInputFile(path, esFileDictionary, orderedFnameList, eventservice)

            # Remove the autoconf
            if "--autoConfiguration=everything " in data['jobPars']:
                data['jobPars'] = data['jobPars'].replace("--autoConfiguration=everything ", " ")
            # Write event service file lists to the proper input file
            #ec, fnames = writeToInputFile(path, esFileDictionary, orderedFnameList)
            ec = 0
            if ec == 0:
                #inputFiles = getESInputFiles(esFileDictionary)

                # Update the inFiles list (not necessary??)
                #data['inFiles'] = inputFiles

                # Correct the dsname?
                # filesize and checksum? not known (no file catalog)

                # Replace the NULL valued guids for the ES files
                data['GUID'] = updateESGUIDs(data['GUID'])

                # Replace the @identifiers in the jobParameters
                #data['jobPars'] = updateJobPars(data['jobPars'], fnames)

                # Update the copytoolin (should use the proper objectstore site mover)
                si = getSiteInformation(experiment)
                if not os.environ.has_key('Nordugrid_pilot') and data.has_key('eventServiceMerge'):
                    ec = si.replaceQueuedataField("copytoolin", "objectstore")

            else:
                tolog("Cannot continue with event service merge job")
        else:
            tolog("Empty event range dictionary")
    else:
        tolog("writeToFile not present in job def")

    return data

def parseDispatcherResponse(response):
    """ Create the parameter list from the dispatcher response """

# use this when listFilesInDataset usage is not needed any more (v 51b)
#    # remove any _subNNN strings if necessary (from dataset names)
#    if "_sub" in response:
#        response = removeSubFromResponse(response)

    try:
        parList = json.loads(response)
    except:
        data = {}
        parList = cgi.parse_qsl(response, keep_blank_values=True)
        for p in parList:
            data[p[0]] = p[1]

        if 'userProxy' in str(parList) or 'privateKey' in str(parList):
            for i in range(len(parList)):
                if parList[i][0] == 'userProxy' or parList[i][0] == 'publicKey' or parList[i][0] == 'privateKey':
                    newList = list(parList[i])
                    newList[1] = 'hidden'
                    parList[i] = newList
    else:
        data = parList.copy()
        if 'jobs' in parList:
            for p in parList['jobs']:
                if 'userProxy' in p:
                     p['userProxy'] = 'hidden'
                if 'privateKey' in p:
                    p['privateKey'] = 'hidden'
                if 'publicKey' in p:
                    p['publicKey'] = 'hidden'

    tolog("Dispatcher response: %s" % str(parList))

    return data, response

def grep(patterns, file_name):
    """ Search for the patterns in the given list in a file """
    # Example:
    # grep(["St9bad_alloc", "FATAL"], "athena_stdout.txt")
    # -> [list containing the lines below]
    #   CaloTrkMuIdAlg2.sysExecute()             ERROR St9bad_alloc
    #   AthAlgSeq.sysExecute()                   FATAL  Standard std::exception is caught

    matched_lines = []
    p = []
    for pattern in patterns:
        p.append(re.compile(pattern))

    try:
        f = open(file_name, "r")
    except IOError, e:
        tolog("!!WARNING!!2999!! %s" % e)
    else:
        while True:
            # get the next line in the file
            line = f.readline()
            if not line:
                break

            # can the search pattern be found
            for cp in p:
                if re.search(cp, line):
                    matched_lines.append(line)
        f.close()
    return matched_lines

def getJobReport(filename):
    """ Extract the job report from the stdout, or the last N lines """

    report = ""
    if os.path.exists(filename):
        pattern = re.compile("Job Report produced by")
        try:
            f = open(filename, "r")
        except IOError, e:
            tolog("!!WARNING!!1299!! %s" % e)
        else:
            matched_lines = []
            status = True
            first_report = True
            while status:
                # get the next line in the file
                line = f.readline()
                if not line:
                    break

                # find the start position of the job report and grab all remaining lines
                if re.search(pattern, line) and first_report:
                    # the job report is repeated, only grab it the second time it appears
                    first_report = False
                elif re.search(pattern, line) and not first_report:
                    # save the job report title line
                    line = line.replace("=====", "-")
                    matched_lines.append(line)
                    while True:
                        line = f.readline()
                        if not line:
                            status = False
                            break
                        matched_lines.append(line)

            # grab the last couple of lines in case the trf failed before the job report was printed
            if len(matched_lines) == 0:
                N = 10
                tolog("Job report could not be found in the payload stdout, will add the last %d lines instead for the log extracts" % (N))
                report = "- Last %d lines from %s -\n" % (N, filename)
                report = report + tail(filename, N)
            else:
                report = "".join(matched_lines)
            f.close()
    else:
        tolog("!!WARNING!!1299!! File %s does not exist" % (filename))

    return report

def tail(filename, number_of_lines):
    """ Grab the last N lines from a file """

    report = ""
    if os.path.exists(filename):
        try:
            # U is to open it with Universal newline support
            f = open(filename, "rU")
        except IOError, e:
            tolog("!!WARNING!!1299!! %s" % e)
        else:
            read_size = 1024
            offset = read_size
            f.seek(0, 2)
            file_size = f.tell()

            # abort if zero file size
            if file_size == 0:
                tolog("!!WARNING!!1299!! File %s has zero size" % (filename))
            else:
                # loop over file
                while True:
                    if file_size < offset:
                        offset = file_size
                    f.seek(-offset, 2)
                    read_str = f.read(offset)

                    try:
                        # Remove newline at the end
                        if read_str[offset - 1] == '\n':
                            read_str = read_str[:-1]
                        lines = read_str.split('\n')

                        # Got number_of_lines lines
                        if len(lines) >= number_of_lines:
                            report = "\n".join(lines[-number_of_lines:])
                            break
                    except Exception, e:
                        # the following message will be visible in the log extracts
                        report = "!!WARNING!!1299!! tail command caught an exception when reading payload stdout: %s" % e
                        tolog(report)
                        break

                    # Reached the beginning
                    if offset == file_size:
                        report = read_str
                        break
                    offset += read_size
            f.close()
    else:
        tolog("!!WARNING!!1299!! File %s does not exist" % (filename))

    return report

def filterJobReport(report):
    """ Extract the exit and error code from the job report """

    filtered_report = ""
    filters = ["ExitCode", "ErrorCode"]
    patterns = []
    for _filter in filters:
        patterns.append(re.compile(_filter))

    # loop over the full report
    if report != "":
        header = True
        for line in report.split("\n"):
            # grab the header line
            if header:
                filtered_report += line + "\n"
                header = False
            # match the exit and error code lines
            for pattern in patterns:
                if re.search(pattern, line):
                    filtered_report += line + "\n"
    else:
        tolog("!!WARNING!!2999!! Found empty job report")

    return filtered_report

def removeDuplicates(seq):
    """ Order preserving duplicate removal """

    checked = []
    for entry in seq:
        if entry not in checked:
            checked.append(entry)
    return checked

def dumpOrderedItems(l):
    """ dump list l """

    _i = 0
    for item in l:
        _i += 1
        if item == "":
            tolog("%d. <empty>" % (_i))
        else:
            tolog("%d. %s" % (_i, item))

def getDatasetDict(outputFiles, destinationDblock, logFile, logFileDblock, archive=False):
    """ Create a dataset dictionary """

    datasetDict = None

    # verify that the lists are of equal size
    if len(outputFiles) != len(destinationDblock):
        tolog("WARNING: Lists are not of same length: len(outputFiles)=%d, len(destinationDblock)=%d" % (len(outputFiles), len(destinationDblock)))
    elif len(outputFiles) == 0:
        tolog("No output files for this job (outputFiles has zero length)")
    elif len(destinationDblock) == 0:
        tolog("WARNING: destinationDblock has zero length")
    else:
        # verify that lists contains valid entries
        _l = [outputFiles, destinationDblock]
        ok = True
        for _list in _l:
            for _entry in _list:
                if _entry == "NULL" or _entry == "" or _entry == " " or _entry == None:
                    # ignore if archive
                    if not archive:
                        tolog("!!WARNING!!2999!! Found non-valid entry in list: %s" % str(_list))
                        ok = False
                        break

        if ok:
            # build the dictionary
            try:
                datasetDict = dict(zip(outputFiles, destinationDblock))
            except Exception, e:
                tolog("!!WARNING!!2999!! Exception caught in getDatasetDict(): %s" % e)
                datasetDict = None
            else:
                # add the log file info
                datasetDict[logFile] = logFileDblock

    return datasetDict

def getFileGuid(metadata_filename, guid_filename):
    """ read the log guid from metadata """

    logFileGuid = ""

    if os.path.exists(metadata_filename):
        try:
            xmldoc = minidom.parse(metadata_filename)
            _fileList = xmldoc.getElementsByTagName("File")
            for thisfile in _fileList:
                lfn = str(thisfile.getElementsByTagName("lfn")[0].getAttribute("name"))
                _guid = str(thisfile.getAttribute("ID"))
                if guid_filename == lfn:
                    logFileGuid = _guid
                    tolog("Guid %s belongs to file %s" % (_guid, lfn))
        except Exception, e:
            tolog("!!WARNING!!2999!! Could not parse the metadata - guids unknown: %s" % (e))
    else:
        tolog("!!WARNING!!2999!! Could not locate %s, log file guid can not be verified" % (metadata_filename))

    return logFileGuid

def tailPilotErrorDiag(pilotErrorDiag, size=256):
    """ Return the last n characters of pilotErrorDiag """

    try:
        return pilotErrorDiag[-size:]
    except:
        return pilotErrorDiag

def headPilotErrorDiag(pilotErrorDiag, size=256):
    """ Return the first n characters of pilotErrorDiag """

    try:
        return pilotErrorDiag[:size]
    except:
        return pilotErrorDiag

def getMaxInputSize(MB=False):
    """ Return a proper maxinputsize value """

    _maxinputsize = readpar('maxwdir') # normally 14336+2000 MB
    MAX_INPUT_FILESIZES = 14*1024*1024*1024 # 14 GB, 14336 MB (pilot default)
    MAX_INPUT_FILESIZES_MB = 14*1024 # 14336 MB (pilot default)
    if _maxinputsize != "":
        try:
            if MB: # convert to MB int
                _maxinputsize = int(_maxinputsize) # MB
            else: # convert to B int
                _maxinputsize = int(_maxinputsize)*1024*1024 # MB -> B
        except Exception, e:
            tolog("!!WARNING!!2999!! schedconfig.maxinputsize: %s" % e)
            if MB:
                _maxinputsize = MAX_INPUT_FILESIZES_MB
            else:
                _maxinputsize = MAX_INPUT_FILESIZES
    else:
        if MB:
            _maxinputsize = MAX_INPUT_FILESIZES_MB
        else:
            _maxinputsize = MAX_INPUT_FILESIZES

    if MB:
        tolog("Max input size = %d MB (pilot default)" % (_maxinputsize))
    else:
        tolog("Max input size = %d B (pilot default)" % (_maxinputsize))

    return _maxinputsize

def getTimeFloor(timefloor_default):
    """ Return a proper timefloor """
    # timefloor is the time limit within which the pilot is allowed to run multiple jobs
    # if at the end of a job, there is enough time (i.e. at least [timefloor] s left), the pilot
    # will ask for another job

    try:
        if timefloor_default != None:
            timefloor = timefloor_default
            tolog("(Overriding any schedconfig.timefloor with timefloor set by pilot option -C %d)" % (timefloor_default))
        else:
            timefloor = int(readpar('timefloor'))*60 # assumed to be in minutes, convert into seconds
    except:
        tolog("Timefloor not set in queuedata (multi-jobs disabled)")
        timefloor = 0
    else:
        _lower = 0
        _upper = 60*60*24 # one day
        if timefloor == 0:
            tolog("Timefloor set to zero in queuedata (multi-jobs disabled)")
        elif timefloor > _lower and timefloor <= _upper:
            tolog("Timefloor set to %d s" % (timefloor))
        else:
            tolog("Timefloor (%d s) out of limits (%d s, %d s) - multi-jobs disabled" % (timefloor, _lower, _upper))
            timefloor = 0

    return timefloor

def getCopysetup(mode="get"):
    """ extract a verified copysetup[in] script from queuedata """

    copysetup_tmp = readpar('copysetup')

    if mode == "get":
        _copysetup = readpar('copysetupin')
        if _copysetup == "":
            # not set, use same copysetup for stage-in as for stage-out
            _copysetup = copysetup_tmp
    else:
        _copysetup = copysetup_tmp

    # copysetup can contain ^-signs for remote i/o sites
    if _copysetup.find('^') > -1:
        copysetup = _copysetup.split('^')[0]
    else:
        copysetup = copysetup_tmp

    # make sure that the script exists
    if copysetup == "":
        tolog("No copysetup found")
    elif not os.path.exists(copysetup) and '^' not in copysetup:
        tolog("!!WARNING!!2998!! copysetup does not exist: %s (reset to empty string)" % (copysetup))
        copysetup = ""
    elif '^' in copysetup:
        tolog("No path in copysetup (%s, reset to empty string)" % (copysetup))
        copysetup = ""
    else:
        tolog("Extracted copysetup: %s" % (copysetup))

    return copysetup

def verifyLFNLength(outputFiles):
    """ Make sure that the LFNs are all within the allowed length """

    ec = 0
    pilotErrorDiag = ""
    error = PilotErrors()
    MAXFILENAMELENGTH = 255

    # loop over all output files
    for fileName in outputFiles:
        if len(fileName) > MAXFILENAMELENGTH:
            pilotErrorDiag = "LFN too long (length: %d, must be less than %d characters): %s" % (len(fileName), MAXFILENAMELENGTH, fileName)
            tolog("!!WARNING!!2994!! %s" % (pilotErrorDiag))
            ec = error.ERR_LFNTOOLONG
        else:
            tolog("LFN length verified for file %s" % (fileName))

    return ec, pilotErrorDiag

def isLogfileCopied(workdir, jobId=None):
    """ check whether the log file has been copied or not """

    if jobId:
        if os.path.exists(workdir + '/LOGFILECOPIED_%s' % jobId):
            return True
        else:
            return False
    else:
        if os.path.exists(workdir + '/LOGFILECOPIED'):
            return True
        else:
            return False

def isLogfileRegistered(workdir):
    """ check whether the log file has been registered or not """

    if os.path.exists(workdir + '/LOGFILEREGISTERED'):
        return True
    else:
        return False


def updateJobState(job, site, workNode, recoveryAttempt=0):
    """ update the job state file """

    status = True

    # create a job state object and give it the current job state information
    from JobState import JobState
    JS = JobState()
    if JS.put(job, site, workNode, recoveryAttempt):
        if recoveryAttempt > 0:
            tolog("Successfully updated job state file (recovery attempt number: %d)" % (recoveryAttempt))
        else:
            tolog("Successfully updated job state file at: %s" % (JS.getCurrentFilename()))
    else:
        tolog("!!WARNING!!1000!! Failed to update job state file")
        status = False

    return status

def chdir(dir):
    """ keep track of where we are... """

    os.chdir(dir)
    tolog("current dir: %s" % (os.getcwd()))

def processDBRelease(inputFiles, inFilesGuids, realDatasetsIn, dispatchDblock, dispatchDBlockToken, prodDBlockToken, workdir, jobPars):
    """ remove any DBRelease files from the input file list and send back instruction to move the created DBRelease file to job dir """

    _inputFiles = inputFiles
    _inFilesGuids = inFilesGuids
    _realDatasetsIn = realDatasetsIn
    _dispatchDblock = dispatchDblock
    _dispatchDBlockToken = dispatchDBlockToken
    _prodDBlockToken = prodDBlockToken

    # are there any DBRelease files in the input file list?
    has_DBRelease_files = False
    from DBReleaseHandler import DBReleaseHandler
    dbh = DBReleaseHandler(workdir=workdir)

    # abort if no local DBRelease dir
    if dbh.getDBReleaseDir() == "":
        return _inputFiles, _inFilesGuids, _realDatasetsIn, _dispatchDblock, _dispatchDBlockToken, _prodDBlockToken

    for f in inputFiles:
        # if the DBRelease version can be extracted from the file name, then the file is a DBRelease file..
        if dbh.extractVersion(f):
            tolog("Found a DBRelease file: %s" % (f))
            has_DBRelease_files = True
            break

    if not has_DBRelease_files:
        tolog("No DBRelease files found in input file list")
    else:
        tolog("Found a DBRelease file in the input file list (will check local availability)")

        # get the DBRelease version
        # for testing: version = dbh.getDBReleaseVersion(jobPars=jobPars+" DBRelease-9.0.1.tar.gz")
        version = dbh.getDBReleaseVersion(jobPars=jobPars)

        # create the skeleton DBRelease tarball
        if dbh.createDBRelease(version, workdir):
            # update the input file list
            _inputFiles, _inFilesGuids, _realDatasetsIn, _dispatchDblock, _dispatchDBlockToken, _prodDBlockToken = \
                         dbh.removeDBRelease(list(inputFiles), list(inFilesGuids), list(realDatasetsIn), list(dispatchDblock), list(dispatchDBlockToken), list(prodDBlockToken))

    return _inputFiles, _inFilesGuids, _realDatasetsIn, _dispatchDblock, _dispatchDBlockToken, _prodDBlockToken

def updateXMLWithEndpoints(xml, filenames, endpoints):
    """ Replace any 'tobeset' strings in the XML with final ddm endpoints """

    # Input:  xml (string)
    #         filenames (list of strings)
    #         endpoints (list of strings, 1-to-1 mapped to the filenames list)
    # Output: xml (string)
    # Description: The method looks for patterns '<filename>-ddmendpoint_tobeset' and
    #              replaces it with the corresponding 'endpoint'

    lines = []
    for line in xml.split('\n'):
        i = 0
        s = False
        for filename in filenames:
            p = filename + '-ddmendpoint_tobeset'
            if p in line:
                if endpoints[i] == None:
                    # remove this line without append
                    s = True
                else:
                    lines.append(line.replace(p, endpoints[i]))
                    s = True
                break
            i += 1
        if not s:
            lines.append(line)

    if lines == []:
        r = xml
    else:
        r = '\n'.join(lines)
    return r

def updateXMLWithSURLs(experiment, node_xml, workDir, jobId, jobrec, format=''):
    """ update the XML with the SURLs """

    xml = ""

    # read back the SURL dictionary
    from SiteMover import SiteMover
    sitemover = SiteMover()
    surlDictionary = sitemover.getSURLDictionary(workDir, jobId)

    # get the experiment object
    thisExperiment = getExperiment(experiment)

    tolog("node_xml = %s" % (node_xml))

    node_xml_list = node_xml.split("\n")
    # loop over the xml and update where it is needed
    if surlDictionary != {}:
        if format == 'NG':
            re_tobeset = re.compile('\<surl\>([a-zA-Z0-9-]+)\-surltobeset')
        else:
            if thisExperiment:
                metadata_attr_name = thisExperiment.getAttrForRegistration()
            else:
                metadata_attr_name = "surl"
            re_tobeset = re.compile('\<metadata att\_name\=\"%s\" att\_value\=\"([a-zA-Z0-9-]+)\-surltobeset\"\/\>' % (metadata_attr_name))
        for line in node_xml_list:
            tobeset = re_tobeset.search(line)
            if tobeset:
                # extract the guid and surl
                guid = tobeset.group(1)
                # note: in case of an earlier transfer error, the corresponding guid will not be in the surl dictionary
                # since it is only written to the surl dictionary for successful transfers
                try:
                    surl = surlDictionary[guid]
                except Exception, e:
                    tolog("!!WARNING!!2996!! Encountered a missing guid in the surl dictionary - did the corresponding transfer fail? guid = %s, %s" % (guid, e))
                    # add the 'surltobeset' line when job recovery is used
                    if jobrec:
                        xml += line + "\n"
                else:
                    # replace the guid and the "surltobeset"-string with the surl
                    if surl and surl != "":
                        xml += line.replace(guid + "-surltobeset", surl) + "\n"
                    else:
                        tolog("!!WARNING!!2996!! Could not extract guid %s from xml line: %s" % (guid, line))
            # fail safe in case something went wrong above, remove the guid+surltobeset line
            elif "surltobeset" in line:
                tolog("Failed to remove surltobeset from line: %s" % (line))
            else:
                xml += line + "\n"

    else:
        tolog("!!WARNING!!2997!! Encountered an empty SURL dictionary")

        # remove the metadata for the SURL since it cannot be updated
        for line in node_xml_list:
            if not jobrec:
                if not "surltobeset" in line:
                    xml += line + "\n"
            else:
                xml += line + "\n"

    if xml == "\n":
        xml = ""
        tolog("Reset XML")

    return xml

def putMetadata(workdir, jobId, strXML):
    """ """

    status = False

    filename = os.path.join(workdir, "metadata-%s.xml" % (jobId))
    try:
        f = open(filename, "w")
    except OSError, e:
        tolog("!!WARNING!!1200!! Failed to open metadata file for writing: %s" % (e))
    else:
        f.write(strXML)
        f.close()
        status = True

    return status

def get_metadata_from_xml(workdir, filename="metadata.xml"):
    """
    Parse the payload metadata.xml file.

    Example of metadata.xml:

    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE POOLFILECATALOG SYSTEM 'InMemory'>
    <POOLFILECATALOG>
      <File ID="D2A6D6F4-ADB2-B140-9C2E-D2D5C099B342">
        <logical>
          <lfn name="RDO_011a43ba-7c98-488d-8741-08da579c5de7.root"/>
        </logical>
        <metadata att_name="geometryVersion" att_value="ATLAS-R2-2015-03-01-00"/>
        <metadata att_name="conditionsTag" att_value="OFLCOND-RUN12-SDR-19"/>
        <metadata att_name="size" att_value="3250143"/>
        <metadata att_name="events" att_value="3"/>
        <metadata att_name="beamType" att_value="collisions"/>
        <metadata att_name="fileType" att_value="RDO"/>
      </File>
    </POOLFILECATALOG>

    which gives the following dictionary:

    {'RDO_011a43ba-7c98-488d-8741-08da579c5de7.root': {'conditionsTag': 'OFLCOND-RUN12-SDR-19',
    'beamType': 'collisions', 'fileType': 'RDO', 'geometryVersion': 'ATLAS-R2-2015-03-01-00', 'events': '3',
    'size': '3250143'}}

    :param workdir: payload work directory (string).
    :param filename: metadata file name (string).
    :return: metadata dictionary.
    """

    # metadata_dictionary = { lfn: { att_name1: att_value1, .. }, ..}
    metadata_dictionary = {}
    path = os.path.join(workdir, filename)
    if not os.path.exists(path):
        tolog('!!WARNING!!1212!! File does not exist: %s' % path)
        return metadata_dictionary

    tree = ElementTree.parse(path)
    root = tree.getroot()
    # root.tag = POOLFILECATALOG

    for child in root:
        # child.tag = 'File', child.attrib = {'ID': '4ACC5018-2EA3-B441-BC11-0C0992847FD1'}
        lfn = ""
        for grandchild in child:
            # grandchild.tag = 'logical', grandchild.attrib = {}
            if grandchild.tag == 'logical':
                for greatgrandchild in grandchild:
                    # greatgrandchild.tag = lfn
                    # greatgrandchild.attrib = lfn {'name': 'RDO_011a43ba-7c98-488d-8741-08da579c5de7.root'}
                    lfn = greatgrandchild.attrib.get('name')
                    metadata_dictionary[lfn] = {}
            elif grandchild.tag == 'metadata':
                # grandchild.attrib = {'att_name': 'events', 'att_value': '3'}
                name = grandchild.attrib.get('att_name')
                value = grandchild.attrib.get('att_value')
                metadata_dictionary[lfn][name] = value
            else:
                # unknown metadata entry
                pass

    return metadata_dictionary

def getMetadata(workdir, jobId, athena=False, altpath=""):
    """ read metadata from file """

    strXML = None

    if altpath == "":
        BAK = ""
        if athena:
            BAK = ".PAYLOAD"

        # are we in recovery mode? then jobId is set
        if jobId:
            filename = "metadata-%s.xml%s" % (jobId, BAK)
        else:
            filename = "metadata.xml%s" % (BAK)

        fname = os.path.join(workdir, filename)
    else:
        fname = altpath

    tolog("Trying to read metadata from file: %s" % (fname))
    if os.path.exists(fname):
        try:
            f = open(fname)
        except Exception, e:
            tolog("!!WARNING!!1000!! Can not open the file %s, %s" % (fname, e))
        else:
            strXML = ""
            for line in f:
                strXML += line
            f.close()
            if len(strXML) > 0:
                tolog("Found metadata")
            else:
                tolog("!!WARNING!!1000!! Empty metadata")
    else:
        tolog("getMetadata: metadata does not seem to have been created (file %s does not exist)" % (fname))

    return strXML

def makeJobReport(job, logExtracts, foundCoreDump, version, jobIds):
    """ Make the job summary error report. Use info from jobReport.pickle if available """

    error = PilotErrors()
    perr = job.result[2]
    terr = job.result[1]

    # was this a multi-trf job?
    nJobs = job.jobPars.count("\n")
    if nJobs > 0:
        multi_trf = True
    else:
        multi_trf = False

    tolog("..Job report..................................................................................................")
    tolog(". Pilot version             : %s" % (version))
    tolog(". Job id                    : %s" % (job.jobId))
    tolog(". Current job status        : %s" % (job.result[0]))

    if multi_trf:
        tolog(". Trf job type              : Multi-trf (%d jobs)" % (nJobs + 1))
    else:
        tolog(". Trf job type              : Single trf job")

    try: # protect against sites that run older pilots that don't have the finalstate defined
        fs = job.finalstate
        if fs != "":
            tolog(". Final job state           : %s" % (fs))
        else:
            tolog(". Final job state           : (not set, job should have failed)")
    except:
        tolog("(not set - site should update pilot distribution)")
        fs = None

    if verifyTransfer(job.workdir, verbose=False):
        tolog(". All out files transferred : Yes")
    else:
        tolog(". All out files transferred : No")
    if perr != 0:

        tolog(". Pilot error code          : %d, %s" % (perr, error.getPilotErrorDiag(perr)))
        if error.isRecoverableErrorCode(perr) and job.result[0] != "failed":
            tolog(". Job is recoverable        : Yes")
        else:
            tolog(". Job is recoverable        : No")
    else: # perr == 0
        tolog(". Pilot error code          : %d, (no pilot error)" % (perr))
        if fs == "failed" or job.result[0] == "failed":
            tolog(". Job is recoverable        : No")

    if job.pilotErrorDiag != None:
        lenPilotErrorDiag = len(job.pilotErrorDiag)
        if lenPilotErrorDiag > 250:
            tolog(". Length pilot error diag   : %d (will be truncated to 250)" % (lenPilotErrorDiag))
        else:
            tolog(". Length pilot error diag   : %d" % (lenPilotErrorDiag))
        if job.pilotErrorDiag != "":
            l = 100
            tolog(". Pilot error diag [%d:]   : %s" % (l, headPilotErrorDiag(job.pilotErrorDiag, size=l)))
        else:
            tolog(". Pilot error diag          : Empty")
    else:
        tolog(". Pilot error diag          : None")

    fname = getPilotstderrFilename()
    if os.path.exists(fname):
        _size = os.path.getsize(fname)
        if _size > 0:
            tolog(". Pilot produced stderr     : Yes (size: %d) see dump below" % (_size))
        else:
            tolog(". Pilot produced stderr     : No")

    fname = "%s/runjob.stderr" % (job.workdir)
    if os.path.exists(fname):
        _size = os.path.getsize(fname)
        if _size > 0:
            tolog(". RunJob produced stderr    : Yes (size: %d) see dump below" % (_size))
        else:
            tolog(". RunJob produced stderr    : No")

    tolog(". Trf error code            : %d" % (terr))

    # trf error should have been read from the jobInfo.xml or jobReport* files
    if terr != job.exeErrorCode:
        tolog(". Trf error code (2)        : %d" % job.exeErrorCode)
    tolog(". Trf error diagnosis       : %s" % job.exeErrorDiag)


    if (job.exeErrorCode != 0) and (job.result[1] != job.exeErrorCode):
        mismatch = "exeErrorCode = %d, transExitCode = %d" %\
                   (job.exeErrorCode, job.result[1])
        tolog(". Trf error code mismatch   : %s" % mismatch)

    lenLogExtracts = len(logExtracts)
    if lenLogExtracts <= 2048:
        tolog(". Length log extracts       : %d (preliminary)" % (lenLogExtracts))
    else:
        tolog(". Length log extracts       : %d (will be truncated to 2048)" % (lenLogExtracts))

    # did the payload produce any stderr?
    if multi_trf:
        for _i in range(nJobs + 1):
            _stderr = job.stderr
            _stderr = _stderr.replace(".txt", "_%d.txt" % (_i + 1))
            filename = "%s/%s" % (job.workdir, _stderr)
            if os.path.exists(filename):
                if os.path.getsize(filename) > 0:
                    tolog(". Payload %d produced stderr : Yes (check %s)" % (_i + 1, _stderr))
                else:
                    tolog(". Payload %d produced stderr : No (empty %s)" % (_i + 1, _stderr))
            else:
                tolog(". Payload %d produced stderr: No (%s does not exist)" % (_i + 1, _stderr))
    else:
        filename = "%s/%s" % (job.workdir, job.stderr)
        if os.path.exists(filename):
            if os.path.getsize(filename) > 0:
                tolog(". Payload produced stderr   : Yes (check %s)" % (job.stderr))
            else:
                tolog(". Payload produced stderr   : No (empty %s)" % (job.stderr))
        else:
            tolog(". Payload produced stderr   : No (%s does not exist)" % (job.stderr))

    if foundCoreDump:
        tolog(". Found core dump in workdir: Yes")
    else:
        tolog(". Found core dump in workdir: No")

    if len(jobIds) > 1:
        tolog(". Executed multi-jobs       : %s" % str(jobIds))

    tolog(". Job was executed in dir   : %s" % job.workdir)
    tolog(". Error report produced at  : %s" % timeStamp())
    tolog("..Time report.................................................................................................")
    tolog(". CPU consumption time      : %s %s" % (str(job.cpuConsumptionTime), job.cpuConsumptionUnit))
    tolog(". Payload execution time    : %s s" % (str(job.timeExe)))
    tolog(". GetJob consumption time   : %s s" % (str(job.timeGetJob)))
    tolog(". Stage-in consumption time : %s s" % (str(job.timeStageIn)))
    tolog(". Stage-out consumption time: %s s" % (str(job.timeStageOut)))
    tolog("..............................................................................................................")

    # dump the pilot stderr if it exists
    fname = getPilotstderrFilename()
    if os.path.exists(fname):
        if os.path.getsize(fname) > 0:
            tolog("\n//begin %s ///////////////////////////////////////////////////////////////////////////" % os.path.basename(fname))
            dumpFile(fname, topilotlog=False)
            tolog("\n//end %s /////////////////////////////////////////////////////////////////////////////" % os.path.basename(fname))

    # dump the wrapper (RunJob) stderr if it exists
    fname = "%s/runjob.stderr" % (job.workdir)
    if os.path.exists(fname):
        if os.path.getsize(fname) > 0:
            tolog("\n//begin %s ///////////////////////////////////////////////////////////////////////////" % os.path.basename(fname))
            dumpFile(fname, topilotlog=True)
            tolog("\n//end %s /////////////////////////////////////////////////////////////////////////////" % os.path.basename(fname))

    if job.result[0] == 'finished' or job.result[0] == 'holding':
        if job.result[0] == 'holding':
            tolog("Note that the following line is a message to the Panda monitor only")
        tolog("!!FINISHED!!0!!Job successfully completed")

def safe_call(func, *args):
    """ Try-statement wrapper around function call with traceback info """

    status = False
    try:
        func(*args)
    except Exception, e:
        tolog("!!WARNING!!1111!! Exception in function %s: %s" % (e, func))
        tolog("Stack trace:")

        import traceback

        exc, msg, tb = sys.exc_info()
        traceback.print_tb(tb)
    else:
        status = True

    return status

def getDispatcherErrorDiag(ec):
    """ Get the corresponding error diag for the dispatcher """

    # dispatcher codes
    codes = {}
    codes[0] = 'Success'
    codes[10] = 'Connection timed out'
    codes[20] = 'Dispatcher has no jobs'
    codes[30] = 'Failed'
    codes[40] = 'Non secure'
    codes[50] = 'Invalid token'
    codes[60] = 'Invalid role'
    codes[255] = 'EC_Failed'

    pilotErrorDiag = codes.get(ec, 'GETJOB encountered error %d' % (ec))
    tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))

    return pilotErrorDiag

def getCopyprefixFromTo(copyprefix):
    """ extract from and to info from copyprefix """

    pfrom = ""
    pto = ""

    if copyprefix != "":
        if copyprefix.count("^") == 1:
            pfrom, pto = copyprefix.split("^")
        elif copyprefix.startswith("^") or copyprefix.count("^") > 1:
            tolog("!!WARNING!!2988!! copyprefix has wrong format (not pfrom^pto): %s" % (copyprefix))
        else:
            pfrom = copyprefix

    if pfrom == "":
        pfrom = "dummy"
    else:
        if pfrom.endswith('/'):
            pfrom = pfrom[:-1]
            tolog("Cut away trailing / from %s (see copyprefix[in])" % (pfrom))
    if pto == "":
        pto = "dummy"

    return pfrom, pto

def getCopyprefixLists(copyprefix):
    """ Get the copyprefix lists """

    pfrom, pto = getCopyprefixFromTo(copyprefix)
    if "," in pfrom:
        pfroms = pfrom.split(",")
    else:
        pfroms = [pfrom]
    if "," in pto:
        ptos = pto.split(",")
    else:
        ptos = [pto]

    return pfroms, ptos

def getCmtconfig(jobCmtconfig):
    """ Get the cmtconfig from the job def or schedconfig """

    # the job def should always contain the cmtconfig
    if jobCmtconfig != "" and jobCmtconfig != "None" and jobCmtconfig != "NULL":
        cmtconfig = jobCmtconfig
        tolog("Will try to use cmtconfig: %s (from job definition)" % (cmtconfig))
    else:
        cmtconfig = readpar('cmtconfig')
        tolog("Will try to use cmtconfig: %s (from schedconfig DB)" % (cmtconfig))

    return cmtconfig

def getCmtconfigAlternatives(cmtconfig, swbase):
    """ get a list of locally available cmtconfig's that can be used as alternatives to the primary cmtconfig in case it doesn't work """
    # prepend the default cmtconfig

    alternatives = [cmtconfig]

    from glob import glob

    # grab all files/directories in swbase dir
    dirList = glob(os.path.join(swbase, '*'))

    # use a hardcoded cmtconfigvalidation listfor now
    valid_names= ['i686-', 'x86_64-']

    # are there any dirs that have the cmtconfig pattern? (require at least three '-')
    pattern = re.compile('([A-Za-z0-9]+\-[A-Za-z0-9]+\-[A-Za-z0-9]+\-[A-Za-z0-9]+)')
    for directory in dirList:
        d = os.path.basename(directory)
        found = re.search(pattern, d)
        if found and d != cmtconfig:
            # make surethat weare notpickingup unvalid names (eg 'tags-BNL-Subcluster-4-BNL-ATLAS' would slip through otherwise)
            verified = False
            for valid_name in valid_names:
                # require that the found directory begins with i686- or x86_64-
                if d[:len(valid_name)] == valid_name:
                    verified = True
                    break
            if verified:
                alternatives.append(d)


    return alternatives

def extractFilePaths(s):
    """ Extract file paths from given setup string """

    # s = "source /path/setup.sh;export X509_USER_PROXY=/path/x509_up;source aa"
    # -> setup_path = ['/path/setup.sh', 'aa']

    setup_paths = None

    if s != "" and "source " in s:
        setup_paths = []
        s = s.replace(";;", ";")

        # extract all occurances of "source " (with or without a trailing ;)

        # first try a pattern ending with a ;
        pattern = re.compile(r"source (\S+);")

        found = re.findall(pattern, s)
        if len(found) > 0:
            for i in range(len(found)):
                setup_paths.append(found[i])

                # remove the found pattern so not to disturb the remaining search
                s = s.replace("source %s" % (found[i]), "")

        # assume additional patterns with no trailing ;
        pattern = re.compile(r"source (\S+)")

        found = re.findall(pattern, s)
        if len(found) > 0:
            for i in range(len(found)):
                setup_paths.append(found[i])

    if setup_paths == None:
        return setup_paths

    # note: there is a special case if the first setup path contains an unevaluated environment variable, e.g.
    # s = "export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; source $ATLAS_LOCAL_ROOT_BASE/user/atlasLocalSetup.sh --quiet;"
    # -> setup_paths = ['$ATLAS_LOCAL_ROOT_BASE/user/atlasLocalSetup.sh']
    # in that case, grab the environment variable from s and replace it in setup_paths
    # -> env_variables = {'ATLAS_LOCAL_ROOT_BASE': '/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase'}
    # -> setup_paths = ['/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase/user/atlasLocalSetup.sh']

    pattern = re.compile(r"export (\S+)\=(\S+)")
    t = re.findall(pattern, s) # t = [('ATLAS_LOCAL_ROOT_BASE', '/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase')]
    if t != []:
        for i in range(len(setup_paths)):
            for match_pair in t:
                try:
                    e, v = match_pair
                    v = v.replace(";", "").strip()
                    setup_paths[i] = setup_paths[i].replace("$" + e, v).replace("${" + e + "}", v)
                except Exception, e:
                    tolog("WARNNING: Error happened when extracting setup path: %s" % (e))

    return setup_paths

def verifySetupCommand(error, _setup_str):
    """ Make sure the setup command exists """

    ec = 0
    pilotErrorDiag = ""

    # remove any '-signs
    _setup_str = _setup_str.replace("'", "")
    tolog("Will verify: %s" % (_setup_str))

    if _setup_str != "" and "source " in _setup_str:
        # first extract the file paths from the source command(s)
        setup_paths = extractFilePaths(_setup_str)

        # only run test if string begins with an "/"
        if setup_paths:
            # verify that the file paths actually exists
            for setup_path in setup_paths:
                if os.path.exists(setup_path):
                    tolog("File %s has been verified" % (setup_path))
                else:
                    pilotErrorDiag = "No such file or directory: %s" % (setup_path)
                    tolog('!!WARNING!!2991!! %s' % (pilotErrorDiag))
                    ec = error.ERR_NOSUCHFILE
                    break
        else:
            # nothing left to test
            pass
    else:
        tolog("Nothing to verify in setup: %s (either empty string or no source command)" % (_setup_str))

    return ec, pilotErrorDiag

def getProperTimeout(paths):
    """ Return a proper time-out depending on if CVMFS is used or not"""
    # paths can contain several paths and commands, just look for the presence of /cvmfs/

    if "/cvmfs/" in paths:
        _timeout = 1000
    else:
        _timeout = 500

        # other special cases
        if os.environ.has_key('FACTORYQUEUE'):
            if "BNL_ATLAS_RCF" in os.environ['FACTORYQUEUE']:
                _timeout = 1000

    return _timeout

def getPilotVersion(initdir):
    """ Load the pilot version string from file VERSION """

    version = "PICARD"
    try:
        f = open(os.path.join(initdir, "PILOTVERSION"), "r")
    except Exception, e:
        print "!!WARNING!!0000!! Could not read pilot version from file: %s" % (e )
    else:
        _version = f.read().strip()
        # remove trailing \n if present
        if "\n" in _version:
            _version = _version.replace("\n", "")
        # trivial check
        pattern = re.compile(r"[A-Z]+ [A-Za-z0-9.]+")
        v = re.findall(pattern, _version)
        if v == []:
            print "!!WARNING!!0000!! Not a valid version format: %s" % (version)
        else:
            version = _version

    return version

# Necessary to initiate pilot version at this point, after the function declaration
# Note: this cannot be done in environment.py due to cyclic dependence of pUtil module
env['version'] = getPilotVersion(env['pilot_initdir'])

def getExperiment(experiment):
    """ Return a reference to an experiment class """

    from ExperimentFactory import ExperimentFactory
    factory = ExperimentFactory()
    _exp = None

    try:
        experimentClass = factory.newExperiment(experiment)
    except Exception, e:
        tolog("!!WARNING!!1114!! Experiment factory threw an exception: %s" % (e))
    else:
        _exp = experimentClass()

    return _exp

def getSiteInformation(experiment):
    """ Return a reference to a site information class """

    # The SiteInformationFactory ensures that the returned object is a Singleton
    # Usage:
    #    _exp = getSiteInformation(readpar('experiment')) # or from pilot option
    #    if _exp:
    #        _exp.somemethod("Hello")
    #    else:
    #        tolog("!!WARNING!!1111!! Failed to instantiate experiment class")

    from SiteInformationFactory import SiteInformationFactory
    factory = SiteInformationFactory()
    _exp = None

    try:
        siteInformationClass = factory.newSiteInformation(experiment)
    except Exception, e:
        tolog("!!WARNING!!1114!! SiteInformation factory threw an exception: %s" % (e))
    else:
        _exp = siteInformationClass()
        tolog("getSiteInformation: got experiment=%s" % (_exp.getExperiment()))

    return _exp

def dumpPilotInfo(version, pilot_version_tag, pilotId, jobSchedulerId, pilot_initdir, tofile=True):
    """ Pilot info """

    tolog("PanDA Pilot, version %s" % (version), tofile=tofile, essential=True)
    tolog("Version tag = %s" % (pilot_version_tag))
    tolog("PilotId = %s, jobSchedulerId = %s" % (str(pilotId), str(jobSchedulerId)), tofile=tofile)
    tolog("Current time: %s" % (timeStamp()), tofile=tofile)
    tolog("Run by Python %s" % (sys.version), tofile=tofile)
    tolog("%s bit OS" % (OSBitsCheck()), tofile=tofile)
    tolog("Pilot init dir: %s" % (pilot_initdir), tofile=tofile)
    if tofile:
        tolog("All output written to file: %s" % (getPilotlogFilename()))
    tolog("Pilot executed by: %s" % (commands.getoutput("whoami")), tofile=tofile)

def removePattern(_string, _pattern):
    """ Remove the regexp pattern from the given string """

    pattern = re.compile(r"%s" % (_pattern))

    found = re.findall(pattern, _string)
    if len(found) > 0:
        _substring = found[0]
        tolog("Found regexp string: %s" % (_substring))
        _string = _string.replace(_substring, "")

    return _string

def isPilotTCPServerAlive(server, port):
    """ Verify that the pilot TCP server is still alive """

    status = False

    import socket
    try:
        # open the socket
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    except Exception, e:
        tolog("!!WARNING!!2911!! Caught a socket/connect exception: %s" % (e))
    else:
        # try to commucate with the TCP server
        s.settimeout(10)
        try:
            s.connect((server, port))
        except Exception, e:
            tolog("!!WARNING!!1912!! Caught a socket/connect exception: %s" % (e))
        else:
            status = True
        s.settimeout(None)
        s.close()

    return status

def encode_string(_string):
    """ Encode a string using urlencode """

    from urllib import urlencode

    # truncate the string in case it is too long
    _string = _string[:1024]

    # put the string to be encoded in a dictionary
    encoded_dict = {"x":_string}

    # handle the =-sign (not possible to have since the decoder of the pilot TCP message does a splitting using =-signs)
    pre = urlencode(encoded_dict)
    encoded_string = pre.replace('x=', '^!^')

    return encoded_string

def decode_string(encoded_string):
    """ Decode a string using parse_qs """

    tolog("Decoding: %s" % (encoded_string))
    imported = False
    try:
        # on modern python, get the parse function from urlparse
        from urlparse import parse_qs
    except:
        pass
    else:
        imported = True

    if not imported:
        # on ancient python, get the parse function from cgi
        from cgi import parse_qs

    # handle the =-sign (put back)
    if '^!^' in encoded_string:
        encoded_string = encoded_string.replace('^!^', 'x=')

    decoded_string = ""
    try:
        decoded_dict = parse_qs(encoded_string)
    except Exception, e:
        tolog("!!WARNING!!1234!! Failed to parse URL encoded string: %s" % (encoded_string))
    else:
        if decoded_dict.has_key('x'):
            try:
                decoded_string = decoded_dict['x'][0]
            except Exception, e:
                tolog("!!WARNING!!1234!! Failed to decode URL encoded string: %s" % (encoded_string))
        else:
            tolog("Empty URL encoded string (Nothing to decode)")

        # get rid of useless info
        if decoded_string == "^!^":
            tolog("Resetting decoded string (TCP ping signal)")
            decoded_string = ""

    return decoded_string

def stripDQ2FromLFN(lfn):
    """ Remove any legacy __DQ2 part of an LFN """
    # E.g. LFN = AOD.505307._000001.pool.root.9__DQ2-1315236060
    # -> AOD.505307._000001.pool.root.9
    # This method assumes that the LFN contains the legacy __DQ2-<nr> substring

    pattern = "(\s*)\_\_DQ2\-[0-9]+"

    found = re.search(pattern, lfn)
    if found:
        try:
            __DQ2 = found.group(0)
        except Exception, e:
            tolog("!!WARNING!!1112!! Failed to identify legacy __DQ2 substring: %s" % (e))
        else:
            lfn = lfn.replace(__DQ2, "")

    return lfn

def fastCleanup(workdir, pilot_initdir, rmwkdir):
    """ Cleanup the site workdir """

    print "fastCleanup() called"

    # return to the pilot init dir, otherwise wrapper will not find curl.config
    chdir(pilot_initdir)

    if rmwkdir or rmwkdir == None:
        if os.path.exists(workdir):
            try:
                rc, rs = commands.getstatusoutput("rm -rf %s" % (workdir))
            except Exception, e:
                print "!!WARNING!!1999!! Could not remove site workdir: %s, %s" % (workdir, e)
            else:
                if rc == 0:
                    print "Removed site workdir: %s" % (workdir)
                else:
                    print "!!WARNING!!1999!! Could not remove site workdir: %s, %d, %s" % (workdir, rc, rs)
                    s = 3*60
                    max_attempts = 2
                    attempt = 0
                    while attempt < max_attempts:
                        print "Sleeping %d seconds before trying again (re-attempt %d/%d)" % (s, attempt+1, max_attempts)
                        time.sleep(s)
                        try:
                            rc, rs = commands.getstatusoutput("rm -rf %s" % (workdir))
                        except Exception, e:
                            print "!!WARNING!!1999!! Could not remove site workdir: %s, %s" % (workdir, e)
                        else:
                            if rc == 0:
                                print "Removed site workdir: %s" % (workdir)
                            else:
                                print "!!WARNING!!1999!! Could not remove site workdir: %s, %d, %s" % (workdir, rc, rs)
                                dir_list = os.listdir(workdir)
                                print str(dir_list)
                                for f in dir_list:
                                    if ".nfs" in f:
                                        fname = os.path.join(workdir, f)
                                        print "Found NFS lock file: %s" % (fname)
                                        cmd = "lsof %s" % (fname)
                                        print "Executing command: %s" % (cmd)
                                        out = commands.getoutput(cmd)
                                        print out

                                        pid = None
                                        pattern = re.compile('sh\s+([0-9]+)')
                                        lines = out.split('\n')
                                        for line in lines:
                                            _pid = pattern.search(line)
                                            if _pid:
                                                pid = _pid.group(1)
                                                break
                                        if pid:
                                            print "Attempting to kill pid=%s" % (pid)
                                            cmd = "kill -9 %s" % (pid)
                                            out = commands.getoutput(cmd)
                                            print out
                                cmd = 'ps -fwu %s' % (commands.getoutput("whoami"))
                                print "%s: %s" % (cmd + '\n', commands.getoutput(cmd))
                        attempt += 1
        else:
            print "Nothing to cleanup (site workdir does not exist: %s)" % (workdir)
    else:
        print "rmwkdir flag set to False - will not delete %s" % (workdir)

    # flush buffers
    sys.stdout.flush()
    sys.stderr.flush()

def getStdoutFilename(workdir, preliminary_stdout_filename):
    """ Return the proper stdout filename """
    # In the case of runGen/runAthena, the preliminary filename should be updated since stdout is redirected at some point
    # In the case there are *.log files present, they are of greater interest than the stdout file so the last updated
    # one will be chosen instead of the stdout (prod jobs)

    # look for *.log files
    from FileHandling import findLatestTRFLogFile
    filename = findLatestTRFLogFile(workdir)
    # fall back to old method identifying the stdout file name
    if filename == "":
        from glob import glob

        # look for redirected stdout
        _path = os.path.join(os.path.join(workdir, "workDir"), "tmp.stdout.*")
        tolog("path=%s"%(_path))
        path_list = glob(_path)
        if len(path_list) > 0:
            # there should only be one path
            tolog("Found redirected stdout: %s" % str(path_list))
            filename = path_list[0]
        else:
            filename = preliminary_stdout_filename

    tolog("Using stdout filename: %s" % (filename))
    return filename

def getStdoutDictionary(jobDic):
    """ Create a dictionary with the tails of all running payloads """

    stdout_dictionary = {}
    number_of_lines = 20 # tail -20 filename

    # loop over all parallel jobs
    # (after multitasking was removed from the pilot, there is actually only one job)
    for k in jobDic.keys():

        jobId = jobDic[k][1].jobId

        # abort if not debug mode, but save an empty entry in the dictionary
        if not jobDic[k][1].debug:
            stdout_dictionary[jobId] = ""
            continue

        # is this a multi-trf job?
        nJobs = jobDic[k][1].jobPars.count("\n") + 1
        for _i in range(nJobs):
            _stdout = jobDic[k][1].stdout
            if nJobs > 1:
                _stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))

            #filename = "%s/%s" % (jobDic[k][1].workdir, _stdout)
            # _stdout is the preliminary filename, but can be different, e.g. runGen/runAthena redirects stdout
            filename = getStdoutFilename(jobDic[k][1].workdir, _stdout)
            if os.path.exists(filename):
                try:
                    # get the tail
                    cmd = "tail -%d %s" % (number_of_lines, filename)
                    tolog("Executing command: %s" % (cmd))
                    stdout = commands.getoutput(cmd)
                except Exception, e:
                    tolog("!!WARNING!!1999!! Tail command threw an exception: %s" % (e))
                    stdout_dictionary[jobId] = "(no stdout, caught exception: %s)" % (e)
                else:
                    if stdout == "":
                        tolog("!!WARNING!!1999!! Tail revealed empty stdout for file %s" % (filename))
                        stdout_dictionary[jobId] = "(no stdout)"
                    else:
                        # add to tail dictionary
                        stdout_dictionary[jobId] = stdout

                        # also keep track of the path to the stdout so we can send it to a text indexer if required
                        index = "path-%s" % (jobId)
                        stdout_dictionary[index] = filename

                        tolog("Stored path=%s at index %s" % (stdout_dictionary[index], index))
                # add the number of lines (later this should always be sent)
                pattern = re.compile(r"(\d+) [\S+]")
                cmd = "wc -l %s" % (filename)
                try:
                    _nlines = commands.getoutput(cmd)
                except Exception, e:
                    pilotErrorDiag = "wc command threw an exception: %s" % (e)
                    tolog("!!WARNING!!1999!! %s" % (pilotErrorDiag))
                    nlines = pilotErrorDiag
                else:
                    try:
                        nlines = re.findall(pattern, _nlines)[0]
                    except Exception, e:
                        pilotErrorDiag = "re.findall threw an exception: %s" % (e)
                        tolog("!!WARNING!!1999!! %s" % (pilotErrorDiag))
                        nlines = pilotErrorDiag
                stdout_dictionary[jobId] += "\n[%s]" % (nlines)
            else:
                tolog("(Skipping tail of payload stdout file (%s) since it has not been created yet)" % (os.path.basename(filename)))
                stdout_dictionary[jobId] = "(stdout not available yet)"

    tolog("Returning tail stdout dictionary with %d entries" % len(stdout_dictionary.keys()))
    return stdout_dictionary

def getStagingRetry(staging):
    """ Return a proper stage-in/out retry option """

    if staging == "stage-in":
        _STAGINGRETRY = readpar("stageinretry")
        _stagingTries = env['stageinretry'] # default value (2)
    else:
        _STAGINGRETRY = readpar("stageoutretry")
        _stagingTries = env['stageoutretry'] # default value (2)

    if _STAGINGRETRY != "":
        try:
            _stagingTries = int(_STAGINGRETRY)
        except Exception, e:
            tolog("!!WARNING!!1113!! Problematic %s retry number: %s, %s" % (staging, _STAGINGRETRY, e))
        else:
            stagingTries = _stagingTries
            tolog("Updated %s retry number to %d" % (staging, stagingTries))
    else:
        stagingTries = _stagingTries
        tolog("Updated %s retry number to %d" % (staging, stagingTries))

    return stagingTries


def handleQueuedata(_queuename, _pshttpurl, error, thisSite, _jobrec, _experiment, forceDownload = False, forceDevpilot = False):
    """ handle the queuedata download and post-processing """

    tolog("Processing queuedata")

    # get the site information object
    si = getSiteInformation(_experiment)

    # get the experiment object
    thisExperiment = getExperiment(_experiment)

    # (re-)download the queuedata
    ec, hasQueuedata = si.getQueuedata(_queuename, forceDownload=forceDownload, url=_pshttpurl)
    if ec != 0:
        return ec, thisSite, _jobrec, hasQueuedata

    # Get the new queuedata file from AGIS (unless it already exists)
    #try:
    #    s = si.getNewQueuedata(_queuename, overwrite=False)
    #except Exception, e:
    #    tolog("!!WARNING!!1212!! Exception caught: %s" % (e))

    if hasQueuedata:
        # update queuedata and thisSite if necessary
        ec, _thisSite, _jobrec = si.postProcessQueuedata(_queuename, _pshttpurl, thisSite, _jobrec, forceDevpilot)
        if ec != 0:
            return error.ERR_GENERALERROR, thisSite, _jobrec, hasQueuedata
        else:
            thisSite = _thisSite

        # should the server or the pilot do the LFC registration?
        if readpar("lfcregister") == "server":
            tolog("File registration will be done by server")

            # special check for storm sites
            _copytool = readpar('copytool')
            _copytoolin = readpar('copytoolin')
            if _copytool == "storm" and _copytoolin == "":
                _copytool = "lcgcp2"
                _copytoolin = "storm"
                tolog("!!WARNING!!1112!! Found schedconfig misconfiguration: Forcing queuedata update for storm site: copytool=%s, copytoolin=%s" % (_copytool, _copytoolin))
                ec = si.replaceQueuedataField("copytool", _copytool)
                ec = si.replaceQueuedataField("copytoolin", _copytoolin)
        else:
            # since lfcregister is not set, make sure that copytool is not set to lcgcp2
            if readpar("copytool") == "lcgcp2" or readpar("copytool") == "lcg-cp2" and readpar('region') != 'US':
                tolog("!!FAILED!!1111!! Found schedconfig misconfiguration: Site cannot use copytool=lcgcp2 without lfcregister=server")
                return error.ERR_GENERALERROR, thisSite, _jobrec, hasQueuedata

            tolog("File catalog registration no longer supported by pilot")
            return error.ERR_GENERALERROR, thisSite, _jobrec, hasQueuedata

        # should the number of stage-in/out retries be updated?
        env['stageinretry'] = getStagingRetry("stage-in")
        env['stageoutretry'] = getStagingRetry("stage-out")

    # does the application directory exist?
    ec = thisExperiment.verifySwbase(readpar('appdir'))
    if ec != 0:
        return ec, thisSite, _jobrec, hasQueuedata

    # update experiment for Nordugrid
    global experiment
    if os.environ.has_key('Nordugrid_pilot'):
        experiment = "Nordugrid-ATLAS"

    # reset site.appdir
    thisSite.appdir = readpar('appdir')

    if readpar('glexec') == "True":
        env['glexec'] = 'True'
    elif readpar('glexec') == "test":
        env['glexec'] = 'test'
    else:
        env['glexec'] = 'False'

    return ec, thisSite, _jobrec, hasQueuedata

def postJobTask(job, thisSite, thisNode, experiment, jr=False, ra=0, stdout_tail=None, stdout_path=None):
    """
    update Panda server with output info (xml) and make/save the tarball of the job workdir,
    only for finished or failed jobs.
    jr = job recovery
    ra = recovery attempt
    """

    # create and instantiate the job log object
    from JobLog import JobLog
    joblog = JobLog()

    # create the log
    joblog.postJobTask(job, thisSite, experiment, thisNode, jr=jr, ra=ra, stdout_tail=stdout_tail, stdout_path=stdout_path)

def verifyRecoveryDir(recoveryDir):
    """
    make sure that the recovery directory actually exists
    """

    # does the recovery dir actually exists?
    if recoveryDir != "":
        if os.path.exists(recoveryDir):
            tolog("Recovery directory exists: %s" % (recoveryDir))
        else:
            tolog("!!WARNING!!1190!! Recovery directory does not exist: %s (will not be used)" % (recoveryDir))
            recoveryDir = ""

    return recoveryDir

def removeTestFiles(job_state_files, mode="default"):
    """
    temporary code for removing test files or standard job state files
    """
    # for mode="default", normal jobState-<jobId>.pickle files will be returned
    # for mode="test", jobState-<jobId>-test.pickle files will be returned

    new_job_state_files = []
    if mode == "default":
        for f in job_state_files:
            if not "-test.pickle" in f:
                new_job_state_files.append(f)
    else:
        for f in job_state_files:
            if "-test.pickle" in f:
                new_job_state_files.append(f)

    return new_job_state_files

def moveToExternal(workdir, recoveryDir):
    """
    move job state file(s), and remaining log/work dir(s) to an external dir for later recovery
    also updates the job state file with the new info
    """
    from JobState import JobState

    status = True

    # make sure the recovery directory actually exists
    recoveryDir = verifyRecoveryDir(recoveryDir)
    if recoveryDir == "":
        tolog("!!WARNING!!1190!! verifyRecoveryDir failed")
        return False

    tolog("Using workdir: %s, recoveryDir: %s" % (workdir, recoveryDir))
    JS = JobState()

    # grab all job state files from the workdir
    from glob import glob
    job_state_files = glob("%s/jobState-*.pickle" % (workdir))

    # purge any test job state files (testing for new job rec algorithm)
    job_state_files = removeTestFiles(job_state_files, mode="default")

    _n = len(job_state_files)
    tolog("Number of found jobState files: %d" % (_n))
    if _n == 0:
        return False

    for job_state_file in job_state_files:
        # read back all job info n order to update it with the new recovery dir info
        if JS.get(job_state_file):
            # decode the job state info
            _job, _site, _node, _recoveryAttempt = JS.decode()
            _basenameSiteWorkdir = os.path.basename(_site.workdir)
            _basenameJobWorkdir = os.path.basename(_job.workdir)
            _basenameJobDatadir = os.path.basename(_job.datadir)
            siteworkdir = _site.workdir

            # create the site work dir on the external disk
            externalDir = "%s/%s" % (recoveryDir, _basenameSiteWorkdir)
            tolog("Using external dir: %s" % (externalDir))
            # does the external dir already exist? (e.g. when $HOME is used)
            if os.path.exists(externalDir):
                tolog("External dir already exists")
            else:
                # group rw permission added as requested by LYON
                # ec, rv = commands.getstatusoutput("mkdir -m g+rw %s" % (externalDir))
                # 770 at the request of QMUL/Alessandra Forti?
                ec, rv = commands.getstatusoutput("mkdir -m 770 %s" % (externalDir))
                if ec != 0:
                    if rv.find("changing permissions") >= 0 and rv.find("Operation not permitted") >= 0:
                        tolog("!!WARNING!!1190!! Was not allowed to created recovery dir with g+rw")
                        if os.path.exists(externalDir):
                            tolog("!!WARNING!!1190!! Recovery dir was nevertheless created: %s (will continue)" % (externalDir))
                        else:
                            tolog("!!WARNING!!1190!! Could not create dir on external disk: %s" % (rv))
                            return False
                    else:
                        tolog("!!WARNING!!1190!! Could not create dir on external disk: %s" % (rv))
                        return False
                else:
                    tolog("Successfully created external dir with g+rw")

            logfile = os.path.join(_site.workdir, _job.logFile)
            logfile_copied = os.path.join(_site.workdir, "LOGFILECOPIED")
            logfile_registered = os.path.join(_site.workdir, "LOGFILEREGISTERED")
            metadatafile1 = "metadata-%s.xml" % (_job.jobId)
            metadatafile2 = "metadata-%s.xml.PAYLOAD" % (_job.jobId)

            from FileHandling import getExtension
            surlDictionary = os.path.join(_site.workdir, "surlDictionary-%s.%s" % (_job.jobId, getExtension()))

            moveDic = {"workdir" : _job.workdir, "datadir" : _job.datadir, "logfile" : logfile, "logfile_copied" : logfile_copied,
                       "logfile_registered" : logfile_registered, "metadata1" : metadatafile1,
                       "metadata2" : metadatafile2, "surlDictionary" : surlDictionary }
            tolog("Using moveDic: %s" % str(moveDic))
            failures = 0
            successes = 0
            for item in moveDic.keys():
                # does the item actually exists?
                # (e.g. the workdir should be tarred into the log and should not exist at this point)
                if os.path.exists(moveDic[item]):

                    # move the work dir to the external dir
                    ec, rv = commands.getstatusoutput("mv %s %s" % (moveDic[item], externalDir))
                    if ec != 0:
                        tolog("!!WARNING!!1190!! Could not move item (%s) to external dir (%s): %s" % (moveDic[item], externalDir, rv))
                        failures += 1
                    else:
                        tolog("Moved holding job item (%s) to external dir (%s)" % (moveDic[item], externalDir))
                        successes += 1

                        # set a new path for the item
                        if item == "workdir":
                            # probably the work dir has already been tarred
                            _job.workdir = os.path.join(recoveryDir, _basenameJobWorkdir)
                            tolog("Updated job workdir: %s" % (_job.workdir))
                        elif item == "datadir":
                            _job.datadir = os.path.join(externalDir, _basenameJobDatadir)
                            tolog("Updated job datadir: %s" % (_job.datadir))
                        else:
                            tolog("(Nothing to update in job state file for %s)" % (item))
                else:
                    # if the log is present, there will not be a workdir
                    tolog("Item does not exist: %s" % (moveDic[item]))

                    # set a new path for the item
                    if item == "workdir":
                        # probably the work dir has already been tarred
                        _job.workdir = os.path.join(recoveryDir, _basenameJobWorkdir)
                        tolog("Updated job workdir: %s" % (_job.workdir))

            # update the job state file with the new state information if any move above was successful
            if successes > 0:
                _site.workdir = externalDir
                tolog("Updated site workdir: %s" % (_site.workdir))
                _retjs = updateJobState(_job, _site, _node)
                if not _retjs:
                    tolog("!!WARNING!!1190!! Could not create job state file in external dir: %s" % (externalDir))
                    tolog("!!WARNING!!1190!! updateJobState failed at critical stage")
                    failures += 1
                else:
                    tolog("Created a new job state file in external dir")

                    # remove the LOCKFILE since it can disturb any future recovery
                    if os.path.exists("%s/LOCKFILE" % (siteworkdir)):
                        ec, rv = commands.getstatusoutput("rm %s/LOCKFILE" % (siteworkdir))
                        if ec != 0:
                            tolog("!!WARNING!!1190!! Could not remove LOCKFILE - can disturb future recovery")
                        else:
                            tolog("Removed LOCKFILE from work dir: %s" % (siteworkdir))

            if failures > 0:
                tolog("!!WARNING!!1190!! Since at least one move to the external disk failed, the original work area")
                tolog("!!WARNING!!1190!! will not be removed and should be picked up by a later pilot doing the recovery")
                status = False
            else:
                tolog("All files were successfully transferred to the external recovery area")
        else:
            tolog("!!WARNING!!1190!! Could not open job state file: %s" % (job_state_file))
            status = False

    return status


def cleanup(wd, initdir, wrflag, rmwkdir):
    """ cleanup function """

    tolog("Overall cleanup function is called")
    # collect any zombie processes
    wd.collectZombieJob(tn=10)
    tolog("Collected zombie processes")

    # get the current work dir
    wkdir = readStringFromFile(os.path.join(initdir, "CURRENT_SITEWORKDIR"))

    # is there an exit code?
    ec = readCodeFromFile(os.path.join(wkdir, "EXITCODE"))

    # is there a process id
    pid = readCodeFromFile(os.path.join(wkdir, "PROCESSID"))
    if pid != 0:
        tolog("Found process id %d in PROCESSID file, will now attempt to kill all of its subprocesses" % (pid))
        killProcesses(pid, os.getpgrp())

    if rmwkdir == None or rmwkdir == False:

        # in case of multi-jobs, the workdir will already be deleted
        if os.path.exists(wkdir):

            # does the job work dir contain a lock file for this job?
            if os.path.exists("%s/LOCKFILE" % (wkdir)):
                tolog("Lock file found: will not delete %s!" % (wkdir))
                lockfile = True
                try:
                    os.system("chmod -R g+w %s" % (initdir))
                except Exception, e:
                    tolog("Failed to chmod pilot init dir: %s" % e)
                    pass
                else:
                    tolog("Successfully changed permission on pilot init dir (for later pilots that may be run by different users)")
            else:
                lockfile = False

            # remove the work dir only when there are no job state files
            if not lockfile and rmwkdir == None:
                tolog("Attempting to remove the pilot workdir %s now!" % (wkdir))
                try:
                    chdir(initdir)
                    os.system("rm -rf %s" % (wkdir))
                except Exception, e:
                    tolog("!!WARNING!!1000!! Failed to remove pilot workdir: %s" % e)
                else:
                    setPilotlogFilename("%s/pilotlog-last.txt" % (initdir))
            else:
                if lockfile:
                    # check if the workdir+job state file should be moved to an external directory
                    # check queuedata for external recovery directory
                    recoveryDir = ""
                    try:
                        recoveryDir = readpar('recoverdir')
                    except:
                        pass
                    else:
                        if recoveryDir != "":
                            if not moveToExternal(wkdir, recoveryDir):
                                tolog("Will not cleanup work area since move to external area at least partially failed")
                            else:
                                # cleanup work dir unless user do not want to
                                if rmwkdir == None:
                                    tolog("Removing the pilot workdir %s now! " % (wkdir))
                                    try:
                                        chdir("/")
                                        os.system("rm -rf %s" % (wkdir))
                                    except Exception, e:
                                        tolog("!!WARNING!!1000!! Failed to remove pilot workdir: %s" % e)
                                try:
                                    os.system("chmod -R g+w %s" % (recoveryDir))
                                except Exception, e:
                                    tolog("Failed to chmod recovery dir: %s" % e)
                                    pass
                                else:
                                    tolog("Successfully changed permission on external recovery dir (for later pilots that may be run by different users)")

                if rmwkdir == False:
                    tolog("rmwkdir flag set to False - will not delete %s" % (wkdir))

        else:
            tolog("Work dir already deleted by multi-job loop: %s" % (wkdir))

    # always remove the workdir if the rmwkdir was set at the pilot launch
    elif rmwkdir:
        if os.path.exists(wkdir):
            tolog("Removing the pilot workdir %s now! " % (wkdir))
            try:
                chdir("/")
                os.system("rm -rf %s" % (wkdir))
            except Exception,e:
                tolog("!!WARNING!!1000!! Failed to remove pilot workdir: %s" % e)
        else:
            tolog("Work dir already deleted by multi-job loop: %s" % (wkdir))

    else:
        tolog("rmwkdir flag set to False - will not delete %s" % (wkdir))

    tolog("Pilot cleanup has finished")

    # wait for the stdout to catch up (otherwise the full log is cut off in the batch stdout dump)
    time.sleep(10)

    # return exit code to wrapper (or caller, runMain())
    if wrflag:
        tolog("Done, returning %d to wrapper" % (ec))
        # flush buffers
        sys.stdout.flush()
        sys.stderr.flush()
        return shellExitCode(ec)
    else:
        tolog("Done, using system exit to quit")
        # flush buffers
        sys.stdout.flush()
        sys.stderr.flush()
        os._exit(0) # need to call this to clean up the socket, thread etc resources

def shellExitCode(exitCode):
    """ Translate the pilot exit code to a proper exit code for the shell """

    # get error handler
    error = PilotErrors()

    # Error code translation dictionary
    # FORMAT: { pilot_error_code : [ shell_error_code, meaning ], .. }

    # Restricting user (pilot) exit codes to the range 64 - 113, as suggested by http://tldp.org/LDP/abs/html/exitcodes.html
    # Using exit code 137 for kill signal error codes (this actually means a hard kill signal 9, (128+9), 128+2 would mean CTRL+C)

    error_code_translation_dictionary = {
        -1                       : [64, "Site offline"],
        error.ERR_GENERALERROR   : [65, "General pilot error, consult batch log"],
        error.ERR_MKDIRWORKDIR   : [66, "Could not create directory"],
        error.ERR_NOSUCHFILE     : [67, "No such file or directory"],
        error.ERR_NOVOMSPROXY    : [68, "Voms proxy not valid"],
        error.ERR_NOPROXY        : [68, "Voms proxy not valid"],
        error.ERR_NOLOCALSPACE   : [69, "No space left on local disk"],
        error.ERR_PILOTEXC       : [70, "Exception caught by pilot"],
        error.ERR_QUEUEDATA      : [71, "Pilot could not download queuedata"],
        error.ERR_QUEUEDATANOTOK : [72, "Pilot found non-valid queuedata"],
        error.ERR_NOSOFTWAREDIR  : [73, "Software directory does not exist"],
        error.ERR_KILLSIGNAL     : [137, "General kill signal"], # Job terminated by unknown kill signal
        error.ERR_SIGTERM        : [143, "Job killed by signal: SIGTERM"], # 128+15
        error.ERR_SIGQUIT        : [131, "Job killed by signal: SIGQUIT"], # 128+3
        error.ERR_SIGSEGV        : [139, "Job killed by signal: SIGSEGV"], # 128+11
        error.ERR_SIGXCPU        : [158, "Job killed by signal: SIGXCPU"], # 128+30
        error.ERR_SIGUSR1        : [144, "Job killed by signal: SIGUSR1"], # 128+16
        error.ERR_SIGBUS         : [138, "Job killed by signal: SIGBUS"]   # 128+10
        }

    if error_code_translation_dictionary.has_key(exitCode):
        return error_code_translation_dictionary[exitCode][0] # Only return the shell exit code, not the error meaning
    elif exitCode != 0:
        tolog("!!WARNING!!1234!! No translation to shell exit code for error code %d" % (exitCode))
        return 1
    else:
        return 0

def updatePandaServer(job, xmlstr=None, spaceReport=False, log=None, ra=0, jr=False, stdout_tail="", stdout_path=""):
    """ Update the panda server with the latest job info """

    # create and instantiate the client object
    from PandaServerClient import PandaServerClient
    client = PandaServerClient(pilot_version = env['version'], pilot_version_tag = env['pilot_version_tag'],
                               pilot_initdir = env['pilot_initdir'], jobSchedulerId = env['jobSchedulerId'],
                               pilotId = env['pilotId'], updateServer = env['updateServerFlag'],
                               jobrec = env['jobrec'], pshttpurl = env['pshttpurl'])

    # update the panda server
    return client.updatePandaServer(job, env['thisSite'], env['workerNode'], env['psport'],
                                    xmlstr = xmlstr, spaceReport = spaceReport, log = log, ra = ra, jr = jr,
                                    useCoPilot = env['useCoPilot'], stdout_tail = stdout_tail, stdout_path = stdout_path)

def sig2exc(sig, frm):
    """ Signal handler """

    error = PilotErrors()

    errorText = "!!FAILED!!1999!! [pilot] Signal %s is caught in pilot parent process!" % str(sig)
    tolog(errorText)
    ec = error.ERR_KILLSIGNAL
    # send to stderr
    print >> sys.stderr, errorText

    # here add the kill function to kill all the real jobs processes
    for k in env['jobDic'].keys():
        tmp = env['jobDic'][k][1].result[0]
        if tmp != "finished" and tmp != "failed" and tmp != "holding":
            if sig == signal.SIGTERM:
                ec = error.ERR_SIGTERM
            elif sig == signal.SIGQUIT:
                ec = error.ERR_SIGQUIT
            elif sig == signal.SIGSEGV:
                ec = error.ERR_SIGSEGV
            elif sig == signal.SIGXCPU:
                ec = error.ERR_SIGXCPU
            elif sig == signal.SIGBUS:
                ec = error.ERR_SIGBUS
            elif sig == signal.SIGUSR1:
                ec = error.ERR_SIGUSR1
            else:
                ec = error.ERR_KILLSIGNAL

            env['jobDic'][k][1].result[0] = "failed"
            env['jobDic'][k][1].subStatus = 'pilot_killed'
            env['jobDic'][k][1].currentState = env['jobDic'][k][1].result[0]
            # do not overwrite any previous error
            if env['jobDic'][k][1].result[2] == 0:
                env['jobDic'][k][1].result[2] = ec
            if not env['logTransferred']:
                env['jobDic'][k][1].pilotErrorDiag = "Job killed by signal %s: Signal handler has set job result to FAILED, ec = %d" %\
                                              (str(sig), ec)
                logMsg = "!!FAILED!!1999!! %s\n%s" % (env['jobDic'][k][1].pilotErrorDiag, env['version'])
                tolog(logMsg)

                ret, retNode = updatePandaServer(env['jobDic'][k][1], log = logMsg)
                if ret == 0:
                    tolog("Successfully updated panda server at %s" % timeStamp())
            else:
                # log should have been transferred
                env['jobDic'][k][1].pilotErrorDiag = "Job killed by signal %s: Signal handler has set job result to FAILED, ec = %d" %\
                                              (str(sig), ec)
                logMsg = "!!FAILED!!1999!! %s\n%s" % (env['jobDic'][k][1].pilotErrorDiag, env['version'])
                tolog(logMsg)

            killProcesses(env['jobDic'][k][0], env['jobDic'][k][1].pgrp)
            # most of the time there is not enough time to build the log
            # postJobTask(env['jobDic'][k][1], globalSite, globalWorkNode, env['experiment'], jr=False)

    # touch a KILLED file which will be seen by the multi-job loop, to prevent further jobs from being started
    try:
        createLockFile(False, env['thisSite'].workdir, "KILLED")
        writeToFile(os.path.join(env['thisSite'].workdir, "EXITCODE"), str(ec))
    except Exception, e:
        tolog("!!WARNING!!2211!! Caught exception: %s" % (e))

    raise SystemError(sig) # this one will trigger the cleanup function to be called

def extractPattern(source, pattern):
    """ Extract 'pattern' from 'source' using a regular expression """

    # Examples
    # surl = "srm://dcache-se-atlas.desy.de/pnfs/desy.de/atlas/dq2/atlasdatadisk/rucio/mc12_8TeV/3e/51/NTUP_SMWZ.01355318._000001.root.1"
    # extractPattern(surl, r'\/rucio\/.+(\/[a-zA-Z0-9]{2}\/[a-zA-Z0-9]{2}\/)') -> "/3e/51/"
    # extractPattern(surl, r'\/rucio\/(.+)\/[a-zA-Z0-9]{2}\/[a-zA-Z0-9]{2}\/') -> "mc12_8TeV"

    extracted = ""
    _pattern = re.compile(pattern)
    _extracted = re.findall(_pattern, source)
    if _extracted != []:
        extracted = _extracted[0]

    return extracted

def getEventService(experiment):
    """ Return a reference to an EventService class """

    # The EventServiceFactory ensures that the returned object is a Singleton
    # Usage:
    #    _exp = getEventService(readpar('experiment')) # or from pilot option
    #    if _exp:
    #        _exp.somemethod("Hello")
    #    else:
    #        tolog("!!WARNING!!1111!! Failed to instantiate EventService class")

    from EventServiceFactory import EventServiceFactory
    factory = EventServiceFactory()
    _exp = None

    try:
        eventServiceClass = factory.newEventService(experiment)
    except Exception, e:
        tolog("!!WARNING!!1114!! EventService factory threw an exception: %s" % (e))
    else:
        _exp = eventServiceClass()

    return _exp

def isValidGUID(guid):
    """ Verify the GUID generated with uuidgen """

    status = False
    pattern = "[A-F0-9]{8}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{4}-[A-F0-9]{12}"

    m = re.search(pattern, guid.upper())
    if not m:
        tolog("!!WARNING!!2333!! GUID=\'%s\' does not follow pattern \'%s\'" % (guid, pattern))
    else:
        status = True

    return status

def getGUID():
    """ Return a GUID generated with uuidgen """

    guid = commands.getoutput('uuidgen 2> /dev/null')

    # Make sure that there was no problem piping to dev null, ie that the GUID is proper
    if not isValidGUID(guid):
        tolog("Trying uuidgen without pipe to /dev/null")
        guid = commands.getoutput('uuidgen')
    if not isValidGUID(guid):
        tolog("!!WARNING!!2233!! Failed to generate GUID")
        guid = ""

    return guid

def extractHPCInfo(infoStr):
    """ Extract HPC name from the info string """
    # Return: isHPCSite (True/False), HPC_name (string)
    # infoStr = "blabla HPC_Titan" -> True, "Titan"
    # infoStr = "blabla bla" -> False, None
    # The HPC name will be capitalized (titan -> Titan)
    name = None
    isHPCSite = False

    m = re.search('HPC\_([A-Za-z0-9]+)', infoStr)
    if m:
        name = m.group(1)
        name = name.capitalize()
        isHPCSite = True

    return isHPCSite, name

def getInitialDirs(path, n):
    """ Get the initial n sub directories in a given path """

    # E.g. path = "/cvmfs/atlas-nightlies.cern.ch/repo/sw/nightlies", n = 3
    # -> subpath = "/cvmfs/atlas-nightlies.cern.ch/repo"

    subpath = ""
    if path[0] == "/":
        s = path.split("/")
        if n <= len(s):
            subpath = "/"
            for i in range(1, n+1):
                subpath = os.path.join(subpath, s[i])
        else:
            subpath = path
    else:
        tolog("!!WARNING!!2211!! Not a path: %s" % (path))

    return subpath

def convert(data):
    """ Convert unicode data to utf-8 """

    # Dictionary:
    #   data = {u'Max': {u'maxRSS': 3664, u'maxSwap': 0, u'maxVMEM': 142260, u'maxPSS': 1288}, u'Avg': {u'avgVMEM': 94840, u'avgPSS': 850, u'avgRSS': 2430, u'avgSwap': 0}}
    # convert(data)
    #   {'Max': {'maxRSS': 3664, 'maxSwap': 0, 'maxVMEM': 142260, 'maxPSS': 1288}, 'Avg': {'avgVMEM': 94840, 'avgPSS': 850, 'avgRSS': 2430, 'avgSwap': 0}}
    # String:
    #   data = u'hello'
    # convert(data)
    #   'hello'
    # List:
    #   data = [u'1',u'2','3']
    # convert(data)
    #   ['1', '2', '3']

    import collections
    if isinstance(data, basestring):
        return str(data)
    elif isinstance(data, collections.Mapping):
        return dict(map(convert, data.iteritems()))
    elif isinstance(data, collections.Iterable):
        return type(data)(map(convert, data))
    else:
        return data

def dumpFile(filename, topilotlog=False):
    """ dump a given file to stdout or to pilotlog """

    if os.path.exists(filename):
        tolog("Dumping file: %s (size: %d)" % (filename, os.path.getsize(filename)))
        try:
            f = open(filename, "r")
        except IOError, e:
            tolog("!!WARNING!!4000!! Exception caught: %s" % (e))
        else:
            i = 0
            for line in f.readlines():
                i += 1
                line = line.rstrip()
                if topilotlog:
                    tolog("%s" % (line))
                else:
                    print "%s" % (line)
            f.close()
            tolog("Dumped %d lines from file %s" % (i, filename))
    else:
        tolog("!!WARNING!!4000!! %s does not exist" % (filename))

def tryint(x):
    """ Used by numbered string comparison (to protect against unexpected letters in version number) """

    try:
        return int(x)
    except ValueError:
        return x

def split_version(s):
    """ Splits version string into parts and converts parts into integers wherever possible.
    split_version("1.2.3") = (1,2,3)
    split_version("1.2.Nightly") = (1,2,"Nightly")
    """

    # Can also be used for sorting:
    # > names = ['YT4.11', '4.3', 'YT4.2', '4.10', 'PT2.19', 'PT2.9']
    # > sorted(names, key=splittedname)
    # ['4.3', '4.10', 'PT2.9', 'PT2.19', 'YT4.2', 'YT4.11']

    from re import split
    return tuple(tryint(x) for x in split('([^.]+)', s))

def isAGreaterOrEqualToB(A, B):
    """ Is numbered string A >= B?
    "1.2.3" > "1.2"  -- more digits
    "1.2.3" > "1.2.2"  -- rank based comparison
    "1.3.2" > "1.2.3"  -- rank based comparison
    "1.2.N" > "1.2.2"  -- nightlies checker, they are always grater
    """
    # > a="1.2.3"
    # > b="2.2.2"
    # > e.isAGreaterOrEqualToB(a,b)

    return split_version(A) >= split_version(B)

def recursive_overwrite(src, dest, ignore=None):
    if os.path.isdir(src):
        if not os.path.isdir(dest):
            os.makedirs(dest)
        files = os.listdir(src)
        if ignore is not None:
            ignored = ignore(src, files)
        else:
            ignored = set()
        for f in files:
            if f not in ignored:
                recursive_overwrite(os.path.join(src, f),
                                    os.path.join(dest, f),
                                    ignore)
    else:
        shutil.copyfile(src, dest)

def chunks(l, n):
    """
    Yield successive n-sized chunks from l.
    """
    for i in xrange(0, len(l), n):
        yield l[i:i+n]

def merge_dictionaries(*dict_args):
    """
    Given any number of dicts, shallow copy and merge into a new dict.
    precedence goes to key value pairs in latter dicts
    """

    result = {}
    for dictionary in dict_args:
        result.update(dictionary)

    return result

def getPooFilenameFromJobPars(jobPars):
    """ Extract the @poo filename from the job parameters """

    filename = ""

    pattern = re.compile(r"\@(\S+)")
    found = re.findall(pattern, jobPars)
    if len(found) > 0:
        filename = found[0]

    return filename

def updateInputFileWithTURLs(jobPars, LFN_to_TURL_dictionary):
    """ Update the @poo input file list with TURLs """

    status = False

    # First try to get the @poo filename (which actually contains the full local path to the file)
    filename = getPooFilenameFromJobPars(jobPars)
    if filename != "":
        if os.path.exists(filename):
            try:
                f = open(filename, "r")
            except IOError, e:
                tolog("!!WARNING!!2997!! Caught exception: %s" % (e))
            else:
                # Read file
                lines = f.readlines()
                f.close()

                # Process file
                turls = []
                header = lines[0]
                for lfn in lines:
                    # Note: the 'lfn' is actually a local file path, but will end with an \n
                    lfn = os.path.basename(lfn)
                    try:
                        # Try to get the corresponding dictionary entry (assume there is a corresponding entry in the actual TURL dictionary)
                        if lfn.endswith('\n'):
                            lfn = lfn[:-1]
                        turl = LFN_to_TURL_dictionary[lfn]
                    except:
                        pass
                    else:
                        turls.append(turl)

                if turls != []:
                    # Overwrite the old file with updated TURL info
                    try:
                        f = open(filename, "w")
                    except IOError, e:
                        tolog("!!WARNING!!2997!! Caught exception: %s" % (e))
                    else:
                        # Write file
                        f.write(header)
                        for turl in turls:
                            f.write(turl + "\n")
                        # Process file
                        f.close()
                        status = True
                else:
                    tolog("!!WARNING!!2998!! Failed to extract TURLs (empty TURL list)")
        else:
            tolog("!!WARNING!!2342!! File not found: %s" % (filename))
    else:
        tolog("!!WARNING!!2343!! Found no @input filename in jobPars: %s" % (jobPars))