# Copyright (C) 2015-2016 Regents of the University of California
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import print_function
from builtins import str
from builtins import range
from builtins import object
import filecmp
from abc import abstractmethod, ABCMeta
from struct import pack, unpack
from uuid import uuid4

from toil.job import Job
from toil.fileStores import FileID
from toil.fileStores.cachingFileStore import IllegalDeletionCacheError, CacheUnbalancedError, CachingFileStore
from toil.test import ToilTest, needs_aws_ec2, needs_google, slow, travis_test
from toil.leader import FailedJobsException
from toil.jobStores.abstractJobStore import NoSuchFileException
from toil.realtimeLogger import RealtimeLogger

import collections
import datetime
import inspect
import logging
import os
import random
import signal
import sys
import time
import pytest
import subprocess

# Python 3 compatibility imports
from six.moves import xrange
from future.utils import with_metaclass

if sys.version_info[0] < 3:
    # Define a usable FileNotFoundError as will be raised by os.remove on a
    # nonexistent file.
    FileNotFoundError = OSError

# Some tests take too long on the AWS jobstore and are unsuitable for CI. They can
# be run during manual tests by setting this to False.
testingIsAutomatic = True logger = logging.getLogger(__name__) class hidden(object): """ Hiding the abstract test classes from the Unittest loader so it can be inherited in different test suites for the different job stores. """ class AbstractFileStoreTest(with_metaclass(ABCMeta, ToilTest)): """ An abstract base class for testing the various general functions described in :class:toil.fileStores.abstractFileStore.AbstractFileStore """ # This is overwritten in the inheriting classs jobStoreType = None def _getTestJobStore(self): if self.jobStoreType == 'file': return self._getTestJobStorePath() elif self.jobStoreType == 'aws': return 'aws:%s:cache-tests-%s' % (self.awsRegion(), str(uuid4())) elif self.jobStoreType == 'google': projectID = os.getenv('TOIL_GOOGLE_PROJECTID') return 'google:%s:cache-tests-%s' % (projectID, str(uuid4())) else: raise RuntimeError('Illegal job store type.') def setUp(self): super(hidden.AbstractFileStoreTest, self).setUp() testDir = self._createTempDir() self.options = Job.Runner.getDefaultOptions(self._getTestJobStore()) self.options.logLevel = 'DEBUG' self.options.realTimeLogging = True self.options.workDir = testDir self.options.clean = 'always' self.options.logFile = os.path.join(testDir, 'logFile') @staticmethod def _uselessFunc(job): """ I do nothing. Don't judge me. """ return None # Sanity test @travis_test def testToilIsNotBroken(self): """ Runs a simple DAG to test if if any features other that caching were broken. """ A = Job.wrapJobFn(self._uselessFunc) B = Job.wrapJobFn(self._uselessFunc) C = Job.wrapJobFn(self._uselessFunc) D = Job.wrapJobFn(self._uselessFunc) A.addChild(B) A.addChild(C) B.addChild(D) C.addChild(D) Job.Runner.startToil(A, self.options) # Test filestore operations. This is a slightly less intense version of the cache specific # test `testReturnFileSizes` @slow def testFileStoreOperations(self): """ Write a couple of files to the jobstore. Delete a couple of them. Read back written and locally deleted files. 
""" workdir = self._createTempDir(purpose='nonLocalDir') F = Job.wrapJobFn(self._testFileStoreOperations, nonLocalDir=workdir, numIters=30, disk='2G') Job.Runner.startToil(F, self.options) @staticmethod def _testFileStoreOperations(job, nonLocalDir, numIters=100): """ Aux function for testFileStoreOperations Conduct numIters operations. """ work_dir = job.fileStore.getLocalTempDir() writtenFiles = {} # fsID: (size, isLocal) localFileIDs = set() # Add one file for the sake of having something in the job store writeFileSize = random.randint(0, 30) cls = hidden.AbstractNonCachingFileStoreTest fsId, _ = cls._writeFileToJobStore(job, isLocalFile=True, nonLocalDir=nonLocalDir, fileMB=writeFileSize) writtenFiles[fsId] = writeFileSize localFileIDs.add(list(writtenFiles.keys())[0]) i = 0 while i <= numIters: randVal = random.random() if randVal < 0.33: # Write writeFileSize = random.randint(0, 30) isLocalFile = True if random.random() <= 0.5 else False fsID, _ = cls._writeFileToJobStore(job, isLocalFile=isLocalFile, nonLocalDir=nonLocalDir, fileMB=writeFileSize) writtenFiles[fsID] = writeFileSize localFileIDs.add(fsID) else: if len(writtenFiles) == 0: continue else: fsID, rdelFileSize = random.choice(list(writtenFiles.items())) rdelRandVal = random.random() if randVal < 0.66: # Read mutable = True if random.random() <= 0.5 else False cache = True if random.random() <= 0.5 else False job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, str(uuid4())]), cache=cache, mutable=mutable) localFileIDs.add(fsID) else: # Delete if rdelRandVal <= 0.5: # Local Delete if fsID not in localFileIDs: continue job.fileStore.deleteLocalFile(fsID) else: # Global Delete job.fileStore.deleteGlobalFile(fsID) writtenFiles.pop(fsID) if fsID in localFileIDs: localFileIDs.remove(fsID) i += 1 @staticmethod def _writeFileToJobStore(job, isLocalFile, nonLocalDir=None, fileMB=1): """ This function creates a file and writes it to the jobstore. 
:param bool isLocalFile: Is the file local(T) or Non-Local(F)? :param str nonLocalDir: A dir to write the file to. If unspecified, a local directory is created. :param int fileMB: Size of the created file in MB """ if isLocalFile: work_dir = job.fileStore.getLocalTempDir() else: assert nonLocalDir is not None work_dir = nonLocalDir with open(os.path.join(work_dir, str(uuid4())), 'wb') as testFile: testFile.write(os.urandom(fileMB * 1024 * 1024)) return job.fileStore.writeGlobalFile(testFile.name), testFile class AbstractNonCachingFileStoreTest(with_metaclass(ABCMeta, AbstractFileStoreTest)): """ Abstract tests for the the various functions in :class:toil.fileStores.nonCachingFileStore.NonCachingFileStore. These tests are general enough that they can also be used for :class:toil.fileStores.CachingFileStore. """ def setUp(self): super(hidden.AbstractNonCachingFileStoreTest, self).setUp() self.options.disableCaching = True class AbstractCachingFileStoreTest(with_metaclass(ABCMeta, AbstractFileStoreTest)): """ Abstract tests for the the various cache-related functions in :class:toil.fileStores.cachingFileStore.CachingFileStore. """ def setUp(self): super(hidden.AbstractCachingFileStoreTest, self).setUp() self.options.disableCaching = False @slow def testExtremeCacheSetup(self): """ Try to create the cache with bad worker active and then have 10 child jobs try to run in the chain. This tests whether the cache is created properly even when the job crashes randomly. 
""" if testingIsAutomatic and self.jobStoreType != 'file': self.skipTest("To save time") self.options.retryCount = 20 self.options.badWorker = 0.5 self.options.badWorkerFailInterval = 0.1 for test in range(0, 20): E = Job.wrapJobFn(self._uselessFunc) F = Job.wrapJobFn(self._uselessFunc) jobs = {} for i in range(0, 10): jobs[i] = Job.wrapJobFn(self._uselessFunc) E.addChild(jobs[i]) jobs[i].addChild(F) Job.Runner.startToil(E, self.options) @slow def testCacheEvictionPartialEvict(self): """ Ensure the cache eviction happens as expected. Two files (20MB and 30MB) are written sequentially into the job store in separate jobs. The cache max is force set to 50MB. A Third Job requests 10MB of disk requiring eviction of the 1st file. Ensure that the behavior is as expected. """ self._testValidityOfCacheEvictTest() # Explicitly set clean to always so even the failed cases get cleaned (This will # overwrite the value set in setUp if it is ever changed in the future) self.options.clean = 'always' self._testCacheEviction(file1MB=20, file2MB=30, diskRequestMB=10) @slow def testCacheEvictionTotalEvict(self): """ Ensure the cache eviction happens as expected. Two files (20MB and 30MB) are written sequentially into the job store in separate jobs. The cache max is force set to 50MB. A Third Job requests 10MB of disk requiring eviction of the 1st file. Ensure that the behavior is as expected. """ self._testValidityOfCacheEvictTest() # Explicitly set clean to always so even the failed cases get cleaned (This will # overwrite the value set in setUp if it is ever changed in the future) self.options.clean = 'always' self._testCacheEviction(file1MB=20, file2MB=30, diskRequestMB=30) @slow def testCacheEvictionFailCase(self): """ Ensure the cache eviction happens as expected. Two files (20MB and 30MB) are written sequentially into the job store in separate jobs. The cache max is force set to 50MB. A Third Job requests 10MB of disk requiring eviction of the 1st file. 
Ensure that the behavior is as expected. """ self._testValidityOfCacheEvictTest() # Explicitly set clean to always so even the failed cases get cleaned (This will # overwrite the value set in setUp if it is ever changed in the future) self.options.clean = 'always' self._testCacheEviction(file1MB=20, file2MB=30, diskRequestMB=60) def _testValidityOfCacheEvictTest(self): # If the job store and cache are on the same file system, file # sizes are accounted for by the job store and are not reflected in # the cache hence this test is redundant (caching will be free). if not self.options.jobStore.startswith(('aws', 'google')): workDirDev = os.stat(self.options.workDir).st_dev jobStoreDev = os.stat(os.path.dirname(self.options.jobStore)).st_dev if workDirDev == jobStoreDev: self.skipTest('Job store and working directory are on the same filesystem.') def _testCacheEviction(self, file1MB, file2MB, diskRequestMB): """ Ensure the cache eviction happens as expected. Two files (20MB and 30MB) are written sequentially into the job store in separate jobs. The cache max is force set to 50MB. A Third Job requests either 10, 30 or 60MB -- requiring eviction of 1 file, both files, or results in an error due to lack of space, respectively. Ensure that the behavior is as expected. """ self.options.retryCount = 0 if diskRequestMB > 50: # This can be non int as it will never reach _probeJobReqs expectedResult = 'Fail' else: expectedResult = 50 - file1MB if diskRequestMB <= file1MB else 0 try: A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=True, fileMB=file1MB) # Sleep for 1 second after writing the first file so that their ctimes are # guaranteed to be distinct for the purpose of this test. 
B = Job.wrapJobFn(self._sleepy, timeToSleep=1) C = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=True, fileMB=file2MB) D = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=50, disk='0M') E = Job.wrapJobFn(self._uselessFunc, disk=''.join([str(diskRequestMB), 'M'])) # Set it to > 2GB such that the cleanup jobs don't die in the non-fail cases F = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=5000, disk='10M') G = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, cached=expectedResult, disk='100M') A.addChild(B) B.addChild(C) C.addChild(D) D.addChild(E) E.addChild(F) F.addChild(G) Job.Runner.startToil(A, self.options) except FailedJobsException as err: with open(self.options.logFile) as f: logContents = f.read() if CacheUnbalancedError.message in logContents: self.assertEqual(expectedResult, 'Fail') else: self.fail('Toil did not raise the expected CacheUnbalancedError but failed for some other reason') @staticmethod def _writeFileToJobStoreWithAsserts(job, isLocalFile, nonLocalDir=None, fileMB=1, expectAsyncUpload=True): """ This function creates a file and writes it to the jobstore. :param bool isLocalFile: Is the file local(T) (i.e. in the file store managed temp dir) or Non-Local(F)? Non-local files should not be cached. :param str nonLocalDir: A dir to write the file to. If unspecified, a local directory is created. :param int fileMB: Size of the created file in MB :param bool expectAsyncUpload: Whether we expect the upload to hit the job store later(T) or immediately(F) """ cls = hidden.AbstractNonCachingFileStoreTest fsID, testFile = cls._writeFileToJobStore(job, isLocalFile, nonLocalDir, fileMB) actual = os.stat(testFile.name).st_nlink # If the caching is free, the job store must have hard links to # everything the file store has. expectJobStoreLink = job.fileStore.cachingIsFree() # How many links ought this file to have? 
expected = 1 if isLocalFile: # We expect a hard link into the cache and not a copy expected += 1 if expectJobStoreLink and not expectAsyncUpload: # We also expect a link in the job store expected += 1 assert actual == expected, 'Should have %d links. Got %d.' % (expected, actual) logger.info('Uploaded %s with %d links', fsID, actual) if not isLocalFile: # Make sure it isn't cached if we don't want it to be assert not job.fileStore.fileIsCached(fsID), "File uploaded from non-local-temp directory %s should not be cached" % nonLocalDir return fsID @staticmethod def _sleepy(job, timeToSleep): """ I'm waiting for prince charming... but only for timeToSleep seconds. :param int timeToSleep: Time in seconds """ time.sleep(timeToSleep) @staticmethod def _adjustCacheLimit(job, newTotalMB): """ This function tells the cache to adopt a new "total" value = newTotalMB, changing the maximum cache disk space allowed for the run. :param int newTotalMB: New total cache disk space limit in MB. """ # Convert to bytes and pass on to the actual cache job.fileStore.adjustCacheLimit(float(newTotalMB * 1024 * 1024)) @staticmethod def _probeJobReqs(job, total=None, cached=None, sigmaJob=None): """ Probes the cacheLockFile to ensure the values for total, disk and cache are as expected. Can also specify combinations of the requirements if desired. :param int total: Expected Total Space available for caching in MB. :param int cached: Expected Total size of files in the cache in MB. :param int sigmaJob: Expected sum of job requirements in MB. 
""" RealtimeLogger.info('Probing job requirements') valueDict = locals() assert (total or cached or sigmaJob) # Work out which function to call for which value toCall = {'total': job.fileStore.getCacheLimit, 'cached': job.fileStore.getCacheUsed, 'sigmaJob': job.fileStore.getCacheExtraJobSpace} for value in ('total', 'cached', 'sigmaJob'): # If the value wasn't provided, it is None and should be ignored if valueDict[value] is None: continue RealtimeLogger.info('Probing cache state: %s', value) expectedBytes = valueDict[value] * 1024 * 1024 cacheInfoBytes = toCall[value]() RealtimeLogger.info('Got %d for %s; expected %d', cacheInfoBytes, value, expectedBytes) assert cacheInfoBytes == expectedBytes, 'Testing %s: Expected ' % value + \ '%s but got %s.' % (expectedBytes, cacheInfoBytes) @slow def testAsyncWriteWithCaching(self): """ Ensure the Async Writing of files happens as expected. The first Job forcefully modifies the cache size to 1GB. The second asks for 1GB of disk and writes a 900MB file into cache then rewrites it to the job store triggering an async write since the two unique jobstore IDs point to the same local file. Also, the second write is not cached since the first was written to cache, and there "isn't enough space" to cache the second. Imediately assert that the second write isn't cached, and is being asynchronously written to the job store. Attempting to get the file from the jobstore should not fail. """ self.options.retryCount = 0 self.options.logLevel = 'DEBUG' A = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=1024, disk='1G') B = Job.wrapJobFn(self._doubleWriteFileToJobStore, fileMB=850, disk='900M') C = Job.wrapJobFn(self._readFromJobStoreWithoutAssertions, fsID=B.rv(), disk='1G') # Set it to > 2GB such that the cleanup jobs don't die. 
D = Job.wrapJobFn(self._adjustCacheLimit, newTotalMB=5000, disk='1G') A.addChild(B) B.addChild(C) C.addChild(D) Job.Runner.startToil(A, self.options) @staticmethod def _doubleWriteFileToJobStore(job, fileMB): """ Write a local file to job store, then write it again. The second should trigger an async write. :param job: job :param fileMB: File Size :return: Job store file ID for second written file """ job.fileStore.logToMaster('Double writing a file into job store') work_dir = job.fileStore.getLocalTempDir() with open(os.path.join(work_dir, str(uuid4())), 'wb') as testFile: testFile.write(os.urandom(fileMB * 1024 * 1024)) job.fileStore.logToMaster('Writing copy 1 and discarding ID') job.fileStore.writeGlobalFile(testFile.name) job.fileStore.logToMaster('Writing copy 2 and saving ID') fsID = job.fileStore.writeGlobalFile(testFile.name) job.fileStore.logToMaster('Copy 2 ID: {}'.format(fsID)) hidden.AbstractCachingFileStoreTest._readFromJobStoreWithoutAssertions(job, fsID) job.fileStore.logToMaster('Writing copy 3 and returning ID') return job.fileStore.writeGlobalFile(testFile.name) @staticmethod def _readFromJobStoreWithoutAssertions(job, fsID): """ Reads a file from the job store. That will be all, thank you. :param job: job :param fsID: Job store file ID for the read file :return: None """ job.fileStore.logToMaster('Reading the written file') job.fileStore.readGlobalFile(fsID) # writeGlobalFile tests @travis_test def testWriteNonLocalFileToJobStore(self): """ Write a file not in localTempDir to the job store. Such a file should not be cached. Ensure the file is not cached. """ workdir = self._createTempDir(purpose='nonLocalDir') A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=False, nonLocalDir=workdir) Job.Runner.startToil(A, self.options) @travis_test def testWriteLocalFileToJobStore(self): """ Write a file from the localTempDir to the job store. Such a file will be cached by default. Ensure the file is cached. 
""" A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=True) Job.Runner.startToil(A, self.options) # readGlobalFile tests @travis_test def testReadCacheMissFileFromJobStoreWithoutCachingReadFile(self): """ Read a file from the file store that does not have a corresponding cached copy. Do not cache the read file. Ensure the number of links on the file are appropriate. """ self._testCacheMissFunction(cacheReadFile=False) @travis_test def testReadCacheMissFileFromJobStoreWithCachingReadFile(self): """ Read a file from the file store that does not have a corresponding cached copy. Cache the read file. Ensure the number of links on the file are appropriate. """ self._testCacheMissFunction(cacheReadFile=True) def _testCacheMissFunction(self, cacheReadFile): """ This is the function that actually does what the 2 cache miss functions want. :param cacheReadFile: Does the read file need to be cached(T) or not(F) """ workdir = self._createTempDir(purpose='nonLocalDir') A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=False, nonLocalDir=workdir) B = Job.wrapJobFn(self._readFromJobStore, isCachedFile=False, cacheReadFile=cacheReadFile, fsID=A.rv()) A.addChild(B) Job.Runner.startToil(A, self.options) @staticmethod def _readFromJobStore(job, isCachedFile, cacheReadFile, fsID, isTest=True): """ Read a file from the filestore. If the file was cached, ensure it was hard linked correctly. If it wasn't, ensure it was put into cache. Note that we may see hard links when we don't expect it based on caching, because immutable reads from the FileJobStore can be fulfilled by hardlinks. We only do immutable reads. :param bool isCachedFile: Flag. Was the read file read from cache(T)? If so, we look for a hard link. :param bool cacheReadFile: Should the the file that is read be cached(T)? :param str fsID: job store file ID :param bool isTest: Is this being run as a test(T) or an accessory to another test(F)? 
""" work_dir = job.fileStore.getLocalTempDir() wantHardLink = False if isCachedFile: outfile = job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, 'temp']), mutable=False) wantHardLink = True else: if cacheReadFile: outfile = job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, 'temp']), cache=True, mutable=False) wantHardLink = True else: assert not job.fileStore.fileIsCached(fsID), "File mistakenly cached before read" outfile = job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, 'temp']), cache=False, mutable=False) assert not job.fileStore.fileIsCached(fsID), "File mistakenly cached after read" wantHardLink = False if isTest: actual = os.stat(outfile).st_nlink if wantHardLink: assert actual > 1, 'Should have multiple links for file that was %s and %s. Got %i.' % ('cached' if isCachedFile else 'not cached', 'saved' if cacheReadFile else 'not saved', actual) # We need to accept harf links even if we don't want them, # because we may get them straight from the FileJobStore since # we asked for immutable reads. return None else: return outfile @travis_test def testReadCachHitFileFromJobStore(self): """ Read a file from the file store that has a corresponding cached copy. Ensure the number of links on the file are appropriate. """ A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=True) B = Job.wrapJobFn(self._readFromJobStore, isCachedFile=True, cacheReadFile=None, fsID=A.rv()) A.addChild(B) Job.Runner.startToil(A, self.options) @slow def testMultipleJobsReadSameCacheHitGlobalFile(self): """ Write a local file to the job store (hence adding a copy to cache), then have 10 jobs read it. Assert cached file size never goes up, assert unused job required disk space is always: (a multiple of job reqs) - (number of current file readers * filesize). At the end, assert the cache shows unused job-required space = 0. 
""" self._testMultipleJobsReadGlobalFileFunction(cacheHit=True) @slow def testMultipleJobsReadSameCacheMissGlobalFile(self): """ Write a non-local file to the job store(hence no cached copy), then have 10 jobs read it. Assert cached file size never goes up, assert unused job required disk space is always: (a multiple of job reqs) - (number of current file readers * filesize). At the end, assert the cache shows unused job-required space = 0. """ self._testMultipleJobsReadGlobalFileFunction(cacheHit=False) def _testMultipleJobsReadGlobalFileFunction(self, cacheHit): """ This function does what the two Multiple File reading tests want to do :param bool cacheHit: Is the test for the CacheHit case(T) or cacheMiss case(F) """ dirPurpose = 'tempWriteDir' if cacheHit else 'nonLocalDir' workdir = self._createTempDir(purpose=dirPurpose) with open(os.path.join(workdir, 'test'), 'w') as x: x.write(str(0)) A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=cacheHit, nonLocalDir=workdir, fileMB=256) B = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100M') jobs = {} for i in range(0, 10): jobs[i] = Job.wrapJobFn(self._multipleFileReader, diskMB=1024, fsID=A.rv(), maxWriteFile=os.path.abspath(x.name), disk='1G', memory='10M', cores=1) A.addChild(jobs[i]) jobs[i].addChild(B) Job.Runner.startToil(A, self.options) with open(x.name, 'r') as y: assert int(y.read()) > 2 @staticmethod def _multipleFileReader(job, diskMB, fsID, maxWriteFile): """ Read a file from the job store immutable and explicitly ask to have it in the cache. If caching files is free, assert used cache space is zero, else assert it is equal to the read file. Also assert the sum job reqs + (number of readers of file * filesize) is and integer multiple of the disk requirements provided to this job. 
:param int diskMB: disk requirements provided to the job :param str fsID: job store file ID :param str maxWriteFile: path to file where the max number of concurrent readers of file will be written """ work_dir = job.fileStore.getLocalTempDir() outfile = job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, 'temp']), cache=True, mutable=False) diskBytes = diskMB * 1024 * 1024 fileStats = os.stat(outfile) fileSize = fileStats.st_size currentReaders = job.fileStore.getFileReaderCount(fsID) extraJobSpace = job.fileStore.getCacheExtraJobSpace() usedCache = job.fileStore.getCacheUsed() logger.info('Extra job space: %s', str(extraJobSpace)) logger.info('Current file readers: %s', str(currentReaders)) logger.info('File size: %s', str(fileSize)) logger.info('Job disk bytes: %s', str(diskBytes)) logger.info('Used cache: %s', str(usedCache)) with open(maxWriteFile, 'r+') as x: prev_max = int(x.read()) x.seek(0) x.truncate() x.write(str(max(prev_max, currentReaders))) if job.fileStore.cachingIsFree(): # No space should be used when caching is free assert usedCache == 0.0 else: # The right amount of space should be used otherwise assert usedCache == fileSize # Make sure that there's no over-usage of job requirements assert ((extraJobSpace + currentReaders * fileSize) % diskBytes) == 0.0 # Sleep so there's no race conditions where a job ends before another can get a hold of # the file time.sleep(3) @staticmethod def _writeExportGlobalFile(job): fileName = os.path.join(job.fileStore.getLocalTempDir(), 'testfile') with open(fileName, 'wb') as f: f.write(os.urandom(1024 * 30000)) # 30 Mb outputFile = os.path.join(job.fileStore.getLocalTempDir(), 'exportedFile') job.fileStore.exportFile(job.fileStore.writeGlobalFile(fileName), 'File://' + outputFile) if not filecmp.cmp(fileName, outputFile): logger.warning('Source file: %s', str(os.stat(fileName))) logger.warning('Destination file: %s', str(os.stat(outputFile))) raise RuntimeError("File {} did not properly get copied to 
{}".format(fileName, outputFile)) @slow def testFileStoreExportFile(self): # Tests that files written to job store can be immediately exported # motivated by https://github.com/BD2KGenomics/toil/issues/1469 root = Job.wrapJobFn(self._writeExportGlobalFile) Job.Runner.startToil(root, self.options) # Testing for the return of file sizes to the sigma job pool. @slow def testReturnFileSizes(self): """ Write a couple of files to the jobstore. Delete a couple of them. Read back written and locally deleted files. Ensure that after every step that the cache is in a valid state. """ workdir = self._createTempDir(purpose='nonLocalDir') F = Job.wrapJobFn(self._returnFileTestFn, jobDisk=2 * 1024 * 1024 * 1024, initialCachedSize=0, nonLocalDir=workdir, disk='2G') Job.Runner.startToil(F, self.options) @slow def testReturnFileSizesWithBadWorker(self): """ Write a couple of files to the jobstore. Delete a couple of them. Read back written and locally deleted files. Ensure that after every step that the cache is in a valid state. """ self.options.retryCount = 20 self.options.badWorker = 0.5 self.options.badWorkerFailInterval = 0.1 workdir = self._createTempDir(purpose='nonLocalDir') F = Job.wrapJobFn(self._returnFileTestFn, jobDisk=2 * 1024 * 1024 * 1024, initialCachedSize=0, nonLocalDir=workdir, numIters=30, disk='2G') Job.Runner.startToil(F, self.options) @staticmethod def _returnFileTestFn(job, jobDisk, initialCachedSize, nonLocalDir, numIters=100): """ Aux function for jobCacheTest.testReturnFileSizes Conduct numIters operations and ensure the cache has the right amount of data in it at all times. Track the cache calculations even thought they won't be used in filejobstore Assumes nothing is evicted from the cache. :param float jobDisk: The value of disk passed to this job. 
""" cached = initialCachedSize RealtimeLogger.info('Expecting %d bytes cached initially', cached) work_dir = job.fileStore.getLocalTempDir() writtenFiles = {} # fsID: (size, isLocal) localFileIDs = collections.defaultdict(list) # fsid: local/non-local/mutable/immutable # Add one file for the sake of having something in the job store writeFileSize = random.randint(0, 30) jobDisk -= writeFileSize * 1024 * 1024 # We keep jobDisk in sync with the amount of free space the job # still has that the file store doesn't know it has used. cls = hidden.AbstractCachingFileStoreTest fsId = cls._writeFileToJobStoreWithAsserts(job, isLocalFile=True, fileMB=writeFileSize) writtenFiles[fsId] = writeFileSize if job.fileStore.fileIsCached(list(writtenFiles.keys())[0]): cached += writeFileSize * 1024 * 1024 RealtimeLogger.info('Expecting %d bytes cached because file of %d MB is cached', cached, writeFileSize) else: RealtimeLogger.info('Expecting %d bytes cached because file of %d MB is not cached', cached, writeFileSize) localFileIDs[list(writtenFiles.keys())[0]].append('local') RealtimeLogger.info('Checking for %d bytes cached', cached) cls._requirementsConcur(job, jobDisk, cached) i = 0 while i <= numIters: randVal = random.random() if randVal < 0.33: # Write RealtimeLogger.info('Writing a file') writeFileSize = random.randint(0, 30) if random.random() <= 0.5: # Write a local file RealtimeLogger.info('Writing a local file of %d MB', writeFileSize) fsID = cls._writeFileToJobStoreWithAsserts(job, isLocalFile=True, fileMB=writeFileSize) writtenFiles[fsID] = writeFileSize localFileIDs[fsID].append('local') jobDisk -= writeFileSize * 1024 * 1024 if job.fileStore.fileIsCached(fsID): cached += writeFileSize * 1024 * 1024 RealtimeLogger.info('Expecting %d bytes cached because file of %d MB is cached', cached, writeFileSize) else: RealtimeLogger.info('Expecting %d bytes cached because file of %d MB is not cached', cached, writeFileSize) else: # Write a non-local file 
RealtimeLogger.info('Writing a non-local file of %d MB', writeFileSize) fsID = cls._writeFileToJobStoreWithAsserts(job, isLocalFile=False, nonLocalDir=nonLocalDir, fileMB=writeFileSize) writtenFiles[fsID] = writeFileSize localFileIDs[fsID].append('non-local') # No change to the job since there was no caching RealtimeLogger.info('Checking for %d bytes cached', cached) cls._requirementsConcur(job, jobDisk, cached) else: if len(writtenFiles) == 0: continue else: fsID, rdelFileSize = random.choice(list(writtenFiles.items())) rdelRandVal = random.random() fileWasCached = job.fileStore.fileIsCached(fsID) if randVal < 0.66: # Read RealtimeLogger.info('Reading a file with size %d and previous cache status %s', rdelFileSize, str(fileWasCached)) if rdelRandVal <= 0.5: # Read as mutable, uncached RealtimeLogger.info('Reading as mutable and uncached; should still have %d bytes cached', cached) job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, str(uuid4())]), mutable=True, cache=False) localFileIDs[fsID].append('mutable') # No change because the file wasn't cached else: # Read as immutable RealtimeLogger.info('Reading as immutable and cacheable') job.fileStore.readGlobalFile(fsID, '/'.join([work_dir, str(uuid4())]), mutable=False, cache=True) localFileIDs[fsID].append('immutable') jobDisk -= rdelFileSize * 1024 * 1024 if not fileWasCached: if job.fileStore.fileIsCached(fsID): RealtimeLogger.info('File was not cached before and is now. Should have %d bytes cached', cached) cached += rdelFileSize * 1024 * 1024 else: RealtimeLogger.info('File was not cached before and still is not now. ' 'Should still have %d bytes cached', cached) else: RealtimeLogger.info('File was cached before. 
Should still have %d bytes cached', cached) cls._requirementsConcur(job, jobDisk, cached) else: # Delete if rdelRandVal <= 0.5: # Local Delete if fsID not in list(localFileIDs.keys()): continue RealtimeLogger.info('Deleting a file locally') job.fileStore.deleteLocalFile(fsID) else: # Global Delete RealtimeLogger.info('Deleting a file globally') job.fileStore.deleteGlobalFile(fsID) try: job.fileStore.readGlobalFile(fsID) except FileNotFoundError as err: pass except: raise RuntimeError('Got wrong error type for read of deleted file') else: raise RuntimeError('Able to read deleted file') writtenFiles.pop(fsID) if fsID in list(localFileIDs.keys()): for lFID in localFileIDs[fsID]: if lFID not in ('non-local', 'mutable'): jobDisk += rdelFileSize * 1024 * 1024 localFileIDs.pop(fsID) if fileWasCached: if not job.fileStore.fileIsCached(fsID): cached -= rdelFileSize * 1024 * 1024 RealtimeLogger.info('File was cached before and is not now. Should have %d bytes cached', cached) else: RealtimeLogger.info('File was cached before and still is cached now. ' 'Should still have %d bytes cached', cached) else: RealtimeLogger.info('File was not cached before deletion. Should still have %d bytes cached', cached) cls._requirementsConcur(job, jobDisk, cached) i += 1 return jobDisk, cached @staticmethod def _requirementsConcur(job, jobDisk, cached): """ Assert the values for job disk and total cached file sizes tracked by the file store are equal to the values we expect. 
""" used = job.fileStore.getCacheUsed() if not job.fileStore.cachingIsFree(): RealtimeLogger.info('Caching is not free; %d bytes are used and %d bytes are expected', used, cached) assert used == cached, 'Cache should have %d bytes used, but actually has %d bytes used' % (cached, used) else: RealtimeLogger.info('Caching is free; %d bytes are used and %d bytes would be expected if caching were not free', used, cached) assert used == 0, 'Cache should have nothing in it, but actually has %d bytes used' % used jobUnused = job.fileStore.getCacheUnusedJobRequirement() assert jobUnused == jobDisk, 'Job should have %d bytes of disk for non-FileStore use but the FileStore reports %d' % (jobDisk, jobUnused) # Testing the resumability of a failed worker @slow def testControlledFailedWorkerRetry(self): """ Conduct a couple of job store operations. Then die. Ensure that the restarted job is tracking values in the cache state file appropriately. """ workdir = self._createTempDir(purpose='nonLocalDir') self.options.retryCount = 1 jobDiskBytes = 2 * 1024 * 1024 * 1024 F = Job.wrapJobFn(self._controlledFailTestFn, jobDisk=jobDiskBytes, testDir=workdir, disk=jobDiskBytes) G = Job.wrapJobFn(self._probeJobReqs, sigmaJob=100, disk='100M') F.addChild(G) Job.Runner.startToil(F, self.options) @staticmethod def _controlledFailTestFn(job, jobDisk, testDir): """ This is the aux function for the controlled failed worker test. It does a couple of cache operations, fails, then checks whether the new worker starts with the expected value, and whether it computes cache statistics correctly. 
:param float jobDisk: Disk space supplied for this job :param str testDir: Testing directory """ # Make sure we actually have the disk size we are supposed to job.fileStore.logToMaster('Job is running with %d bytes of disk, %d requested' % (job.disk, jobDisk)) assert job.disk == jobDisk, 'Job was scheduled with %d bytes but requested %d' % (job.disk, jobDisk) cls = hidden.AbstractCachingFileStoreTest if os.path.exists(os.path.join(testDir, 'testfile.test')): with open(os.path.join(testDir, 'testfile.test'), 'rb') as fH: cached = unpack('d', fH.read())[0] RealtimeLogger.info('Loaded expected cache size of %d from testfile.test', cached) cls._requirementsConcur(job, jobDisk, cached) cls._returnFileTestFn(job, jobDisk, cached, testDir, 20) else: RealtimeLogger.info('Expecting cache size of 0 because testfile.test is absent') modifiedJobReqs, cached = cls._returnFileTestFn(job, jobDisk, 0, testDir, 20) with open(os.path.join(testDir, 'testfile.test'), 'wb') as fH: fH.write(pack('d', cached)) RealtimeLogger.info('Wrote cache size of %d to testfile.test', cached) os.kill(os.getpid(), signal.SIGKILL) @slow def testRemoveLocalMutablyReadFile(self): """ If a mutably read file is deleted by the user, it is ok. """ self._deleteLocallyReadFilesFn(readAsMutable=True) @slow def testRemoveLocalImmutablyReadFile(self): """ If an immutably read file is deleted by the user, it is not ok. """ self._deleteLocallyReadFilesFn(readAsMutable=False) def _deleteLocallyReadFilesFn(self, readAsMutable): self.options.retryCount = 0 A = Job.wrapJobFn(self._writeFileToJobStoreWithAsserts, isLocalFile=True, memory='10M') B = Job.wrapJobFn(self._removeReadFileFn, A.rv(), readAsMutable=readAsMutable, memory='20M') A.addChild(B) Job.Runner.startToil(A, self.options) @staticmethod def _removeReadFileFn(job, fileToDelete, readAsMutable): """ Accept a file. Run os.remove on it. Then attempt to delete it locally. This will raise an error for files read immutably. 
Then write a new file to the jobstore and try to do the same. This should always raise an error :param fileToDelete: File written to the job store that is tracked by the cache """ work_dir = job.fileStore.getLocalTempDir() # Are we processing the read file or the written file? processsingReadFile = True # Read in the file outfile = job.fileStore.readGlobalFile(fileToDelete, os.path.join(work_dir, 'temp'), mutable=readAsMutable) tempfile = os.path.join(work_dir, 'tmp.tmp') # The first time we run this loop, processsingReadFile is True and fileToDelete is the # file read from the job store. The second time, processsingReadFile is False and # fileToDelete is one that was just written in to the job store. Ensure the correct # behaviour is seen in both conditions. while True: os.rename(outfile, tempfile) try: job.fileStore.deleteLocalFile(fileToDelete) except IllegalDeletionCacheError: job.fileStore.logToMaster('Detected a deleted file %s.' % fileToDelete) os.rename(tempfile, outfile) else: # If we are processing the write test, or if we are testing the immutably read # file, we should not reach here. assert processsingReadFile and readAsMutable if processsingReadFile: processsingReadFile = False # Write a file with open(os.path.join(work_dir, str(uuid4())), 'wb') as testFile: testFile.write(os.urandom(1 * 1024 * 1024)) fileToDelete = job.fileStore.writeGlobalFile(testFile.name) outfile = testFile.name else: break @travis_test def testDeleteLocalFile(self): """ Test the deletion capabilities of deleteLocalFile """ self.options.retryCount = 0 workdir = self._createTempDir(purpose='nonLocalDir') A = Job.wrapJobFn(self._deleteLocalFileFn, nonLocalDir=workdir) Job.Runner.startToil(A, self.options) @staticmethod def _deleteLocalFileFn(job, nonLocalDir): """ Test deleteLocalFile on a local write, non-local write, read, mutable read, and bogus jobstore IDs. 
""" work_dir = job.fileStore.getLocalTempDir() # Write local file with open(os.path.join(work_dir, str(uuid4())), 'wb') as localFile: localFile.write(os.urandom(1 * 1024 * 1024)) localFsID = job.fileStore.writeGlobalFile(localFile.name) # write Non-Local File with open(os.path.join(nonLocalDir, str(uuid4())), 'wb') as nonLocalFile: nonLocalFile.write(os.urandom(1 * 1024 * 1024)) nonLocalFsID = job.fileStore.writeGlobalFile(nonLocalFile.name) # Delete fsid of local file. The file should be deleted job.fileStore.deleteLocalFile(localFsID) assert not os.path.exists(localFile.name) # Delete fsid of non-local file. The file should persist job.fileStore.deleteLocalFile(nonLocalFsID) assert os.path.exists(nonLocalFile.name) # Read back one file and then delete it readBackFile1 = job.fileStore.readGlobalFile(localFsID) job.fileStore.deleteLocalFile(localFsID) assert not os.path.exists(readBackFile1) # Read back one file with 2 different names and then delete it. Assert both get deleted readBackFile1 = job.fileStore.readGlobalFile(localFsID) readBackFile2 = job.fileStore.readGlobalFile(localFsID) job.fileStore.deleteLocalFile(localFsID) assert not os.path.exists(readBackFile1) assert not os.path.exists(readBackFile2) # Try to get a non-FileID that doesn't exist. try: job.fileStore.readGlobalFile('bogus') except NoSuchFileException: # TODO: We would like to require TypeError, but for Cactus # support we have to accept non-FileIDs. pass else: raise RuntimeError("Managed to get a file from a non-FileID") # Try to get a FileID for something that doesn't exist try: job.fileStore.readGlobalFile(FileID('bogus', 4096)) except NoSuchFileException: pass else: raise RuntimeError("Managed to read a non-existent file") @travis_test def testSimultaneousReadsUncachedStream(self): """ Test many simultaneous read attempts on a file created via a stream directly to the job store. 
""" self.options.retryCount = 0 self.options.disableChaining = True # Make a file parent = Job.wrapJobFn(self._createUncachedFileStream) # Now make a bunch of children fight over it for i in range(30): parent.addChildJobFn(self._readFileWithDelay, parent.rv()) Job.Runner.startToil(parent, self.options) @staticmethod def _createUncachedFileStream(job): """ Create and return a FileID for a non-cached file written via a stream. """ messageBytes = 'This is a test file\n'.encode('utf-8') with job.fileStore.jobStore.writeFileStream() as (out, idString): # Write directly to the job store so the caching file store doesn't even see it. # TODO: If we ever change how the caching file store does its IDs we will have to change this. out.write(messageBytes) # Now make a file ID fileID = FileID(idString, len(messageBytes)) return fileID @staticmethod def _readFileWithDelay(job, fileID, cores=0.1, memory=50 * 1024 * 1024, disk=50 * 1024 * 1024): """ Read a file from the CachingFileStore with a delay imposed on the download. Should create contention. Has low requirements so we can run a lot of copies at once. """ # Make sure the file store delays # Delay needs to be longer than the timeout for sqlite locking in the file store. 
job.fileStore.forceDownloadDelay = 120 readStart = datetime.datetime.now() logger.debug('Begin read at %s', str(readStart)) localPath = job.fileStore.readGlobalFile(fileID, cache=True, mutable=True) readEnd = datetime.datetime.now() logger.debug('End read at %s: took %f seconds', str(readEnd), (readEnd - readStart).total_seconds()) with open(localPath, 'rb') as fh: text = fh.read().decode('utf-8').strip() logger.debug('Got file contents: %s', text) class NonCachingFileStoreTestWithFileJobStore(hidden.AbstractNonCachingFileStoreTest): jobStoreType = 'file' @pytest.mark.timeout(1000) class CachingFileStoreTestWithFileJobStore(hidden.AbstractCachingFileStoreTest): jobStoreType = 'file' @needs_aws_ec2 class NonCachingFileStoreTestWithAwsJobStore(hidden.AbstractNonCachingFileStoreTest): jobStoreType = 'aws' @slow @needs_aws_ec2 @pytest.mark.timeout(1000) class CachingFileStoreTestWithAwsJobStore(hidden.AbstractCachingFileStoreTest): jobStoreType = 'aws' @needs_google class NonCachingFileStoreTestWithGoogleJobStore(hidden.AbstractNonCachingFileStoreTest): jobStoreType = 'google' @slow @needs_google @pytest.mark.timeout(1000) class CachingFileStoreTestWithGoogleJobStore(hidden.AbstractCachingFileStoreTest): jobStoreType = 'google' def _exportStaticMethodAsGlobalFunctions(cls): """ Define utility functions because Toil can't pickle static methods. Note that this relies on the convention that the first argument of a job function is named 'job'. """ for name, kind, clazz, value in inspect.classify_class_attrs(cls): if kind == 'static method' and name != '__new__': # __new__ became static in 3.7 method = value.__func__ args = inspect.getfullargspec(method).args if args and args[0] == 'job': globals()[name] = method _exportStaticMethodAsGlobalFunctions(hidden.AbstractFileStoreTest) _exportStaticMethodAsGlobalFunctions(hidden.AbstractCachingFileStoreTest) _exportStaticMethodAsGlobalFunctions(hidden.AbstractNonCachingFileStoreTest)