from typing import List, Generator, Any, Optional, Tuple, Union
import operator
import random
from copy import copy

from vergeml.io import Sample
from vergeml.utils import VergeMLError, SPLITS
from vergeml.option import option, Option
from vergeml.plugins import PLUGINS

# Attribute name under which the Operation metadata is stored on a decorated object.
_OPERATION_META_KEY = '__vergeml_operation__'


# TODO should all operations take the apply parameter?
def operation(name, descr=None, long_descr=None, topic='general', apply=True):
    """Define an operation.

    :param name: Name of the operation.
    :param descr: A short description of the operation.
    :param long_descr: A long description.
    :param topic: The topic stored in the operation metadata.
    :param apply: If the preprocessing operation takes the apply parameter.
    :return: A decorator which attaches an Operation metadata object to the
             decorated object and returns that object.
    """
    def decorator(o):
        # An object must not already carry operation metadata.
        assert getattr(o, _OPERATION_META_KEY, None) is None
        options = Option.discover(o)
        cmd = Operation(name, descr=descr, long_descr=long_descr, apply=apply,
                        options=options, topic=topic)
        setattr(o, _OPERATION_META_KEY, cmd)
        return o
    return decorator


class Operation:
    """Metadata record attached to an object by the operation() decorator."""

    def __init__(self, name, descr=None, long_descr=None, apply=True, topic='general',
                 options=None, plugins=PLUGINS):
        """
        :param name: Name of the operation.
        :param descr: A short description of the operation.
        :param long_descr: A long description.
        :param apply: If the operation takes the apply parameter.
        :param topic: The topic of the operation.
        :param options: The options of the operation. Defaults to a new empty list.
        :param plugins: The plugins object stored on the instance.
        """
        self.name = name
        self.descr = descr
        self.long_descr = long_descr
        self.apply = apply
        # A fresh list per instance; a literal [] default would be shared by all instances.
        self.options = [] if options is None else options
        self.plugins = plugins
        self.topic = topic

    @staticmethod
    def discover(o, plugins=PLUGINS):
        """Return the Operation metadata attached to o, or None if there is none.

        :param o: The object to inspect.
        :param plugins: Assigned to the metadata and to each of its options.
        """
        res = None
        if hasattr(o, _OPERATION_META_KEY):
            res = getattr(o, _OPERATION_META_KEY)
            res.plugins = plugins
            for opt in res.options:
                opt.plugins = plugins
        return res


class BaseOperation:
    """Base class for processing samples.

    This class can be used to augment, filter, transform and combine samples. To support
    this variety of use-cases, BaseOperation offers fine-grained control over how samples
    are being processed.

    When an operation changes the number of output samples, it must return the factor in
    multiplier().
    """

    def configuration(self):
        """Return the configuration of the BaseOperation instance.

        Since operations play a role in data processing, any change in an operation
        pipeline must result in a different hash value for a dataset. To capture the
        configuration of a preprocessing operation, BaseOperation defines this method,
        which can be overridden when needed.

        Note that this returns the instance's attribute dictionary itself, not a copy.
        """
        return self.__dict__

    def process(self, sample: Sample,
                ops: Optional[List['BaseOperation']] = None) -> Generator[Sample, None, None]:
        """Complete a processing step in the pipeline and run the next one.

        :param sample: The sample to be processed
        :param ops: The next operations to run
        :return: A generator yielding samples

        The process function is expected to first transform the sample and then to call
        the next BaseOperation, yielding the resulting sample as a return value.
        """
        raise NotImplementedError

    def multiplier(self) -> float:
        """Return the factor by which the operation changes the number of output samples"""
        return 1.0


class OperationPlugin(BaseOperation):
    """Simplified Operations.

    Most operations will not need the raw power of BaseOperation, so an easier to use
    class is provided. In addition to simplified processing, it also offers the following
    functionality:

    - control which split the operation is applied to
    - TODO control which labels the operation is applied to
    - control if the operation is applied to the samples or the ground truth or both.
    - Automatic type checking so that an operation is only applied to supported sample
      data.
    """

    # To implement type checking functionality, a subclass has to set this field to the
    # type it can handle. If it can handle multiple types, Union[Type1, Type2, ..] can be
    # used
    type = Any

    # apply can be set from the operation configuration. it can be a comma separated
    # string, a tuple or None.
    apply = None

    def __init__(self, apply=None):
        """
        :param apply: a tuple, a comma separated string or None
                      possible values are a combination of:
                      - x: apply the operation to the sample only
                      - y: apply the operation to the label only
                      - train: apply the operation only to the train split
                      - val: apply the operation only to the val split
                      - test: apply the operation only to the test split
                      - all: apply the operation to all
        """
        super().__init__()
        assert isinstance(apply, (list, tuple, str, type(None)))

        # self.apply always ends up as a set; an empty set means "no restriction".
        if apply == 'all':
            self.apply = set()
        elif isinstance(apply, (list, tuple)):
            self.apply = set(apply)
        elif isinstance(apply, str):
            # Split the comma separated string and strip whitespace around each entry.
            self.apply = set(map(operator.methodcaller('strip'), apply.split(",")))
        else:
            self.apply = set()
            # NOTE(review): the author marked the variant below as the correct default
            # but left it disabled. Enabling it changes which splits are processed.
            # CORRECT:
            # self.apply = {'train'}

    def transform(self, data: Any, rng: random.Random) -> Any:
        """Transform either x or y.

        :param data: the data to transform
        :param rng: the random generator
        :return: the transformed data

        transform() will pass in data, which can be either x or the y. When x and y of
        the same sample are processed with transform, the random number generator will
        be reset on the second call so that both pieces of data are processed with the
        same random numbers.
        """
        raise NotImplementedError

    def transform_xy(self, x: Any, y: Any, rng: random.Random) -> Tuple[Any, Any]:
        """Transform x and y values.

        :param x: the sample data
        :param y: the ground truth
        :param rng: random generator
        :return: a tuple (x, y)

        When you need to transform x and y values in the same method, for example when
        they are images with different sizes and they need to be resized to have the
        same size, override this method.
        """
        # Remember the generator state so that x and y see the same random numbers.
        rngstate = rng.getstate()

        # When neither 'x' nor 'y' was requested, both are transformed.
        appxy = self.apply.intersection({'x', 'y'})

        if not appxy or 'x' in self.apply:
            if _type_good(self.type, type(x)):
                rng.setstate(rngstate)
                x = self.transform(x, rng)

        if not appxy or 'y' in self.apply:
            if _type_good(self.type, type(y)):
                rng.setstate(rngstate)
                y = self.transform(y, rng)

        return x, y

    def transform_sample(self, sample: Sample) -> Generator[Sample, None, None]:
        """Transform a sample.

        :param sample: the sample to transform
        :return: a generator yielding the resulting sample

        The sample is yielded unchanged when apply names at least one split and the
        split of the sample is not among them. Otherwise a new Sample built from the
        result of transform_xy() is yielded.

        When you don't care about the apply functionality and just want to process the
        sample, override this method.
        """
        if self.apply.intersection(set(SPLITS)) \
                and sample.meta['split'] not in self.apply:
            yield sample
        else:
            x, y = self.transform_xy(sample.x, sample.y, sample.rng)
            yield Sample(x, y, sample.meta, sample.rng)

    def process(self, sample: Sample,
                ops: Optional[List[BaseOperation]] = None) -> Generator[Sample, None, None]:
        """Transform the sample and pass each result on to the remaining operations.

        :param sample: The sample to be processed
        :param ops: The next operations to run. When empty or None, the transformed
                    samples are yielded directly.
        :return: A generator yielding samples
        """
        for s1 in self.transform_sample(sample):
            if not ops:
                yield s1
            else:
                # Hand over to the next operation together with the rest of the pipeline.
                nextop, *rest = ops
                yield from nextop.process(s1, rest)  # pylint: disable=E1101

    def configuration(self):
        """Return a shallow copy of the instance's attribute dictionary."""
        # NOTE(review): the author marked the variant below as correct but left it
        # disabled. It would change the returned value of 'apply' from a set to a
        # sorted list - confirm the effect on consumers before enabling.
        # CORRECT:
        # res = self.__dict__.copy()
        # if 'apply' in res:
        #     res['apply'] = list(sorted(res['apply']))
        return self.__dict__.copy()


def _type_good(t1, t2):
    """Return a truthy value when the declared type t1 accepts the concrete type t2.

    This is the case when t1 is Any, when t1 equals t2, or when t1 is a Union that
    lists t2 among its arguments.
    """
    return t1 == Any or t1 == t2 or \
        (getattr(t1, '__origin__', None) == Union and t2 in t1.__args__)