python source code of files

import glob
import json
import tempfile
import os
import os.path as osp

from bson import json_util
import shutil

import strax
from .common import StorageFrontend

export, __all__ = strax.exporter()


RUN_METADATA_PATTERN = '%s-metadata.json'


@export
class DataDirectory(StorageFrontend):
    """Simplest registry: single directory with FileStore data
    sitting in subdirectories.

    Run-level metadata is stored in loose json files in the directory.
    """

    can_define_runs = True
    provide_run_metadata = False

    def __init__(self, path='.', *args, deep_scan=False, **kwargs):
        """
        :param path: Path to folder with data subfolders.
        :param deep_scan: Let scan_runs scan over folders,
        so even data for which no run-level metadata is available
        is reported.

        For other arguments, see DataRegistry base class.
        """
        super().__init__(*args, **kwargs)
        self.backends = [strax.FileSytemBackend()]
        self.path = path
        self.deep_scan = deep_scan
        if not self.readonly and not osp.exists(self.path):
            os.makedirs(self.path)

    def _run_meta_path(self, run_id):
        return osp.join(self.path, RUN_METADATA_PATTERN % run_id)

    def run_metadata(self, run_id, projection=None):
        path = self._run_meta_path(run_id)
        if not osp.exists(path):
            raise strax.RunMetadataNotAvailable(
                f"No file at {path}, cannot find run metadata for {run_id}")
        with open(path, mode='r') as f:
            md = json.loads(f.read(),
                            object_hook=json_util.object_hook)
        md = strax.flatten_run_metadata(md)
        if projection is not None:
            md = {k: v
                  for k, v in md.items()
                  if k in projection}
        return md

    def write_run_metadata(self, run_id, metadata):
        with open(self._run_meta_path(run_id), mode='w') as f:
            f.write(json.dumps(metadata, default=json_util.default))

    def _scan_runs(self, store_fields):
        """Iterable of run document dictionaries.
        These should be directly convertable to a pandas DataFrame.
        """
        found = set()

        # Yield metadata for runs for which we actually have it
        for md_path in sorted(glob.glob(
                osp.join(self.path,
                         RUN_METADATA_PATTERN.replace('%s', '*')))):
            # Parse the run metadata filename pattern.
            # (different from the folder pattern)
            run_id = osp.basename(md_path).split('-')[0]
            found.add(run_id)
            yield self.run_metadata(run_id, projection=store_fields)

        if self.deep_scan:
            # Yield runs for which no metadata exists
            # we'll make "metadata" that consist only of the run name
            for fn in self._subfolders():
                run_id = self._parse_folder_name(fn)[0]
                if run_id not in found:
                    found.add(run_id)
                    yield dict(name=run_id)

    def _find(self, key, write,
              allow_incomplete, fuzzy_for, fuzzy_for_options):
        dirname = osp.join(self.path, str(key))
        exists = os.path.exists(dirname)
        bk = self.backend_key(dirname)

        if write:
            if exists and not self._can_overwrite(key):
                raise strax.DataExistsError(at=dirname)
            return bk

        if allow_incomplete and not exists:
            # Check for incomplete data (only exact matching for now)
            if fuzzy_for or fuzzy_for_options:
                raise NotImplementedError(
                    "Mixing of fuzzy matching and allow_incomplete "
                    "not supported by DataDirectory.")
            tempdirname = dirname + '_temp'
            bk = self.backend_key(tempdirname)
            if osp.exists(tempdirname):
                return bk

        # Check exact match
        if exists and self._folder_matches(dirname, key, None, None):
            return bk

        # Check metadata of all potentially matching data dirs for match...
        for fn in self._subfolders():
            if self._folder_matches(fn, key,
                                    fuzzy_for, fuzzy_for_options):
                return self.backend_key(fn)

        raise strax.DataNotAvailable

    def _subfolders(self):
        """Loop over subfolders of self.path that match our folder format"""
        if not os.path.exists(self.path):
            return
        for dirname in os.listdir(self.path):
            try:
                self._parse_folder_name(dirname)
            except InvalidFolderNameFormat:
                continue
            yield osp.join(self.path, dirname)

    @staticmethod
    def _parse_folder_name(fn):
        """Return (run_id, data_type, hash) if folder name matches
        DataDirectory convention, raise InvalidFolderNameFormat otherwise
        """
        stuff = osp.normpath(fn).split(os.sep)[-1].split('-')
        if len(stuff) != 3:
            # This is not a folder with strax data
            raise InvalidFolderNameFormat(fn)
        return stuff

    def _folder_matches(
            self, fn, key, fuzzy_for, fuzzy_for_options,
            ignore_name=False):
        """Return the run_id of folder fn if it matches key, or False if it
        does not
        :param name: Ignore the run name part of the key. Useful for listing
        availability
        """
        # Parse the folder name
        try:
            _run_id, _data_type, _hash = self._parse_folder_name(fn)
        except InvalidFolderNameFormat:
            return False

        # Check exact match
        if _data_type != key.data_type:
            return False
        if not ignore_name and _run_id != key.run_id:
            return False

        # Check fuzzy match
        if not (fuzzy_for or fuzzy_for_options):
            if _hash == key.lineage_hash:
                return _run_id
            return False
        metadata = self.backends[0].get_metadata(fn)
        if self._matches(metadata['lineage'], key.lineage,
                         fuzzy_for, fuzzy_for_options):
            return _run_id
        return False

    def backend_key(self, dirname):
        return self.backends[0].__class__.__name__, dirname

    def remove(self, key):
        # There is no database, so removing the folder from the filesystem
        # (which FileStore should do) is sufficient.
        pass


@export
def dirname_to_prefix(dirname):
    """Return filename prefix from dirname"""
    dirname = dirname.replace('_temp', '')
    return os.path.basename(dirname.strip('/').rstrip('\\')).split("-", maxsplit=1)[1]


@export
class FileSytemBackend(strax.StorageBackend):
    """Store data locally in a directory of binary files.

    Files are named after the chunk number (without extension).
    Metadata is stored in a file called metadata.json.
    """

    def get_metadata(self, dirname):
        prefix = dirname_to_prefix(dirname)
        metadata_json = f'{prefix}-metadata.json'
        md_path = osp.join(dirname, metadata_json)

        if not osp.exists(md_path):
            # Try old-format metadata
            # (if it's not there, just let it raise FileNotFound
            # with the usual message in the next stage)
            old_md_path = osp.join(dirname, 'metadata.json')
            if not osp.exists(old_md_path):
                raise strax.DataCorrupted(f"Data in {dirname} has no metadata")
            md_path = old_md_path

        with open(md_path, mode='r') as f:
            return json.loads(f.read())

    def _read_chunk(self, dirname, chunk_info, dtype, compressor):
        fn = osp.join(dirname, chunk_info['filename'])
        return strax.load_file(fn, dtype=dtype, compressor=compressor)

    def _saver(self, dirname, metadata):
        # Test if the parent directory is writeable.
        # We need abspath since the dir itself may not exist,
        # even though its parent-to-be does
        parent_dir = os.path.abspath(os.path.join(dirname, os.pardir))

        # In case the parent dir also doesn't exist, we have to create is
        # otherwise the write permission check below will certainly fail
        try:
            os.makedirs(parent_dir, exist_ok=True)
        except OSError as e:
            raise strax.DataNotAvailable(
                f"Can't write data to {dirname}, "
                f"{parent_dir} does not exist and we could not create it."
                f"Original error: {e}")

        # Finally, check if we have permission to create the new subdirectory
        # (which the Saver will do)
        if not os.access(parent_dir, os.W_OK):
            raise strax.DataNotAvailable(
                f"Can't write data to {dirname}, "
                f"no write permissions in {parent_dir}.")

        return FileSaver(dirname, metadata=metadata)


@export
class FileSaver(strax.Saver):
    """Saves data to compressed binary files"""
    json_options = dict(sort_keys=True, indent=4)

    def __init__(self, dirname, metadata):
        super().__init__(metadata)
        self.dirname = dirname
        self.tempdirname = dirname + '_temp'
        self.prefix = dirname_to_prefix(dirname)
        self.metadata_json = f'{self.prefix}-metadata.json'

        if os.path.exists(dirname):
            print(f"Removing data in {dirname} to overwrite")
            shutil.rmtree(dirname)
        if os.path.exists(self.tempdirname):
            print(f"Removing old incomplete data in {dirname}")
            shutil.rmtree(self.tempdirname)
        os.makedirs(self.tempdirname)
        self._flush_metadata()

    def _flush_metadata(self):
        with open(self.tempdirname + '/' + self.metadata_json, mode='w') as f:
            f.write(json.dumps(self.md, **self.json_options))

    def _chunk_filename(self, chunk_info):
        if 'filename' in chunk_info:
            return chunk_info['filename']
        ichunk = '%06d' % chunk_info['chunk_i']
        return f'{self.prefix}-{ichunk}'

    def _save_chunk(self, data, chunk_info, executor=None):
        filename = self._chunk_filename(chunk_info)

        fn = os.path.join(self.tempdirname, filename)
        kwargs = dict(data=data, compressor=self.md['compressor'])
        if executor is None:
            filesize = strax.save_file(fn, **kwargs)
            return dict(filename=filename, filesize=filesize), None
        else:
            return dict(filename=filename), executor.submit(
                strax.save_file, fn, **kwargs)

    def _save_chunk_metadata(self, chunk_info):
        is_first = chunk_info['chunk_i'] == 0
        if is_first:
            self.md['start'] = chunk_info['start']

        if self.is_forked:
            # Do not write to the main metadata file to avoid race conditions
            # Instead, write a separate metadata.json file for this chunk,
            # to be collected later.

            # We might not have a filename yet:
            # the chunk is not saved when it is empty
            filename = self._chunk_filename(chunk_info)

            fn = f'{self.tempdirname}/metadata_{filename}.json'
            with open(fn, mode='w') as f:
                f.write(json.dumps(chunk_info, **self.json_options))

        # To ensure we have some metadata to load with allow_incomplete,
        # modify the metadata immediately for the first chunk.
        # If we are forked, modifying self.md is harmless since
        # we're in a different process.

        if not self.is_forked or is_first:
            # Just append and flush the metadata
            # (maybe not super-efficient to write the json everytime...
            # just don't use thousands of chunks)
            # TODO: maybe make option to turn this off?
            self.md['chunks'].append(chunk_info)
            self._flush_metadata()

    def _close(self):
        if not os.path.exists(self.tempdirname):
            raise RuntimeError(
                f"{self.tempdirname} was already renamed to {self.dirname}. "
                "Did you attemt to run two savers pointing to the same "
                "directory? Otherwise this could be a strange race "
                "condition or bug.")

        for fn in sorted(glob.glob(
                self.tempdirname + '/metadata_*.json')):
            with open(fn, mode='r') as f:
                self.md['chunks'].append(json.load(f))
            os.remove(fn)

        self._flush_metadata()

        os.rename(self.tempdirname, self.dirname)


@export
class InvalidFolderNameFormat(Exception):
    pass