# python source code of file

import contextlib
import functools
import logging
import multiprocessing as mp
import multiprocessing.pool
import os
import warnings

import numpy as np

from buzzard._actors.message import Msg
from buzzard._actors.pool_job import MaxPrioJobWaiting, PoolJobWorking
from buzzard._gdal_file_raster import BackGDALFileRaster
from buzzard._tools import conv
from buzzard._footprint import Footprint

LOGGER = logging.getLogger(__name__)

class ActorFileChecker(object):
    """Actor that takes care of performing various checks on a cache file from a pool

    When the raster has no `io_pool`, a check is run synchronously as soon as it is
    requested. Otherwise a `Wait` job is scheduled in the pool's waiting room, and it is
    replaced by a `Work` job sent to the pool's working room once a token is received.
    """

    def __init__(self, raster):
        self._raster = raster
        self._back_ds = raster.back_ds
        self._alive = True

        io_pool = raster.io_pool
        if io_pool is not None:
            # `ThreadPool` is tested first because it is a subclass of `Pool`
            if isinstance(io_pool, mp.pool.ThreadPool):
                self._same_address_space = True
            elif isinstance(io_pool, mp.pool.Pool):
                self._same_address_space = False
            else: # pragma: no cover
                assert False, 'Type should be checked in facade'
            pool_id = id(io_pool)
            self._waiting_room_address = '/Pool{}/WaitingRoom'.format(pool_id)
            self._working_room_address = '/Pool{}/WorkingRoom'.format(pool_id)

        # Jobs currently owned by this actor, dropped in `receive_die`
        self._waiting_jobs = set()
        self._working_jobs = set()
        self.address = '/Raster{}/FileChecker'.format(self._raster.uid)

    @property
    def alive(self):
        """True until `receive_die` has been called"""
        return self._alive

    # ******************************************************************************************* **
    def receive_infer_cache_file_status(self, cache_fp, path):
        """Receive message: check the cache file at `path`

        Without an `io_pool` the check is performed right away and its status is sent to
        the `CacheSupervisor`. With an `io_pool` a waiting job is scheduled instead.

        Returns
        -------
        list of Msg
        """
        if self._raster.io_pool is not None:
            waiting_job = Wait(self, cache_fp, path)
            self._waiting_jobs.add(waiting_job)
            return [Msg(self._waiting_room_address, 'schedule_job', waiting_job)]

        status = Work(self, cache_fp, path).func()
        return [Msg(
            'CacheSupervisor', 'inferred_cache_file_status', cache_fp, path, status
        )]

    def receive_token_to_working_room(self, job, token):
        """Receive message: the waiting `job` got a `token`, launch the matching working job

        Returns
        -------
        list of Msg
        """
        self._waiting_jobs.remove(job)
        working_job = Work(self, job.cache_fp, job.path)
        self._working_jobs.add(working_job)
        launch = Msg(self._working_room_address, 'launch_job_with_token', working_job, token)
        return [launch]

    def receive_job_done(self, job, status):
        """Receive message: the working `job` finished, forward `status` to the `CacheSupervisor`

        Returns
        -------
        list of Msg
        """
        self._working_jobs.remove(job)
        report = Msg(
            'CacheSupervisor', 'inferred_cache_file_status', job.cache_fp, job.path, status
        )
        return [report]

    def receive_die(self):
        """Receive message: unschedule/cancel all pending jobs and release references

        Returns
        -------
        list of Msg
        """
        assert self._alive
        self._alive = False

        # Waiting jobs first, then working jobs
        msgs = [
            Msg(self._waiting_room_address, 'unschedule_job', job)
            for job in self._waiting_jobs
        ]
        msgs.extend(
            Msg(self._working_room_address, 'cancel_job', job)
            for job in self._working_jobs
        )

        self._waiting_jobs.clear()
        self._working_jobs.clear()
        self._raster = None
        self._back_ds = None
        return msgs

    # ******************************************************************************************* **

class Wait(MaxPrioJobWaiting):
    """Waiting job of a cache file check, remembering which file should be checked"""

    def __init__(self, actor, cache_fp, path):
        # Kept on the job so that the actor can build the `Work` job once a token arrives
        self.path = path
        self.cache_fp = cache_fp
        super().__init__(actor.address)

class Work(PoolJobWorking):
    """Working job of a cache file check, wrapping a ready-to-call `_cache_file_check`"""

    def __init__(self, actor, cache_fp, path):
        self.cache_fp = cache_fp
        self.path = path

        raster = actor._raster
        # The back data source is only handed to the check when it runs without a pool or
        # in a pool that shares the address space; otherwise `None` is passed instead.
        if raster.io_pool is None or actor._same_address_space:
            back_ds_opt = actor._back_ds
        else:
            back_ds_opt = None

        func = functools.partial(
            _cache_file_check,
            cache_fp, path, len(raster), raster.dtype,
            back_ds_opt,
        )
        raster.debug_mngr.event('object_allocated', func)
        super().__init__(actor.address, func)

def _checksum(fname, buffer_size=512 * 1024, dtype='uint64'):
    # https://github.com/airware/buzzard/pull/39/#discussion_r239071556
    dtype = np.dtype(dtype)
    dtypesize = dtype.itemsize
    assert buffer_size % dtypesize == 0
    assert np.issubdtype(dtype, np.unsignedinteger)

    acc = dtype.type(0)
    with open(fname, "rb") as f:
        with np.warnings.catch_warnings():
            np.warnings.filterwarnings('ignore', r'overflow encountered')

            for chunk in iter(lambda: f.read(buffer_size), b""):
                head = np.frombuffer(chunk, dtype, count=len(chunk) // dtypesize)
                head = np.add.reduce(head, dtype=dtype, initial=acc)
                acc += head

                tailsize = len(chunk) % dtypesize
                if tailsize > 0:
                    # This should only be needed for file's tail
                    tail = chunk[-tailsize:] + b'\0' * (dtypesize - tailsize)
                    tail = np.frombuffer(tail, dtype)
                    acc += tail
        return '{:016x}'.format(acc.item())

def _cache_file_check(cache_fp, path, channel_count, dtype, back_ds_opt):
    """Check that the cache file at `path` is valid

    Two checks are performed:
    1. The checksum of the file content should match the one embedded in the file name.
       On mismatch the file is removed and `False` is returned.
    2. The file is opened as a `GTiff` and its Footprint, dtype and band count should
       match the expected ones. On mismatch an exception is raised and the file is kept.

    Parameters
    ----------
    cache_fp: Footprint
        Expected Footprint of the file
    path: str
        Path to the cache file. The expected checksum is the last `_` separated token of
        the next-to-last `.` separated token of the path.
    channel_count: int
        Expected number of bands
    dtype:
        Expected dtype, compared with the downcast dtype of the first band
    back_ds_opt: None or object
        If not None, it is used to acquire the driver object of the file and to
        deactivate the file when needed. If None, the file is opened directly.

    Returns
    -------
    bool
        True if the file is valid, False if it was removed because of its checksum

    Raises
    ------
    RuntimeError
        If the Footprint, the dtype or the band count of the file are not the expected ones
    Exception
        Whatever is raised while opening or reading the file
    """
    checksum = path.split('.')[-2].split('_')[-1]
    new_checksum = _checksum(path)
    if new_checksum != checksum:
        if back_ds_opt is not None:
            back_ds_opt.deactivate(path)
        LOGGER.warning('Removing {} because invalid checksum ({} instead of {})'.format(
            path, new_checksum, checksum,
        ))
        os.remove(path)
        return False

    allocator = lambda: BackGDALFileRaster.open_file(path, 'GTiff', [], 'r') # This may raise
    with contextlib.ExitStack() as stack:
        try:
            if back_ds_opt is None:
                gdal_ds = allocator()
            else:
                gdal_ds = stack.enter_context(back_ds_opt.acquire_driver_object(path, allocator))

            file_fp = Footprint(
                gt=gdal_ds.GetGeoTransform(),
                rsize=(gdal_ds.RasterXSize, gdal_ds.RasterYSize),
            )
            file_dtype = conv.dtype_of_gdt_downcast(gdal_ds.GetRasterBand(1).DataType)
            file_len = gdal_ds.RasterCount
            if file_fp != cache_fp: # pragma: no cover
                raise RuntimeError('invalid Footprint of {}({} instead of {})'.format(
                    path, file_fp, cache_fp
                ))
            if file_dtype != dtype: # pragma: no cover
                raise RuntimeError('invalid dtype of {}({} instead of {})'.format(
                    path, file_dtype, dtype
                ))
            if file_len != channel_count: # pragma: no cover
                raise RuntimeError('invalid channel_count of {}({} instead of {})'.format(
                    path, file_len, channel_count
                ))
            del gdal_ds
        except Exception:
            # Those exceptions should not trigger a cache file removal, because they might
            # originate from a mistake in the code that does not mean that those files are
            # corrupted. For example:
            # - Maximum number of file descriptors reached
            # - Mismatch in cache directories path
            #
            # `back_ds_opt` is None when there is no back data source to notify. Without this
            # test an `AttributeError` would be raised here in place of the actual error.
            if back_ds_opt is not None:
                back_ds_opt.deactivate(path)
            raise

    return True