import datetime
import functools
import multiprocessing as mp
import os
import pickle
import re

from keras.preprocessing import image
import PIL.Image

from . import exceptions
from . import calc as ic

pj = os.path.join

# Subdirectory inside an image dir where we cache pickled arrays/fingerprints.
ic_base_dir = 'imagecluster'


def read_pk(filename):
    """Read pickled data from `filename`."""
    with open(filename, 'rb') as fd:
        ret = pickle.load(fd)
    return ret


def write_pk(obj, filename):
    """Write object `obj` pickled to `filename`.

    Creates the parent directory of `filename` if it doesn't exist yet.
    """
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'wb') as fd:
        pickle.dump(obj, fd)


def get_files(imagedir, ext='jpg|jpeg|bmp|png'):
    """Return all file names with extension matching the regex `ext` from dir
    `imagedir`.

    Parameters
    ----------
    imagedir : str
    ext : str
        regex of alternatives, matched case-insensitively against the
        file extension (the part after the last dot)

    Returns
    -------
    list
        list of file names (joined with `imagedir`)
    """
    rex = re.compile(r'^.*\.({})$'.format(ext), re.I)
    return [os.path.join(imagedir, base) for base in os.listdir(imagedir)
            if rex.match(base)]


def exif_timestamp(filename):
    """Read timestamp from image in `filename` from EXIF tag.

    This will probably work for most JPG files, but not for PNG, for
    instance.

    Raises
    ------
    exceptions.ICExifReadError
        if the file has no EXIF data, no DateTime tag, or an unsupported
        DateTime format

    Returns
    -------
    float
        timestamp, seconds since Epoch
    """
    # PIL lazy-loads the image data, so this open and _getexif() is fast.
    img = PIL.Image.open(filename)
    if ('exif' not in img.info) or (not hasattr(img, '_getexif')):
        raise exceptions.ICExifReadError(f"no EXIF data found in {filename}")
    # Avoid constucting the whole EXIF dict just to extract the DateTime field.
    # DateTime -> key 306 is in the EXIF standard, so let's use that directly.
    ## date_time = {TAGS[k] : v for k,v in exif.items()}['DateTime']
    exif = img._getexif()
    key = 306
    if key not in exif:
        raise exceptions.ICExifReadError(f"key 306 (DateTime) not found in "
                                         f"EXIF data of file {filename}")
    # '2019:03:10 22:42:42'
    date_time = exif[key]
    if date_time.count(':') != 4:
        msg = f"unsupported EXIF DateTime format in '{date_time}' of {filename}"
        raise exceptions.ICExifReadError(msg)
    # '2019:03:10 22:42:42' -> ['2019', '03', '10', '22', '42', '42']
    date_time_str = date_time.replace(':', ' ').split()
    names = ('year', 'month', 'day', 'hour', 'minute', 'second')
    stamp = datetime.datetime(**{nn: int(vv) for nn, vv
                                 in zip(names, date_time_str)},
                              tzinfo=datetime.timezone.utc).timestamp()
    return stamp


def stat_timestamp(filename):
    """File timestamp from file stats (mtime), seconds since Epoch."""
    return os.stat(filename).st_mtime


def timestamp(filename, source='auto'):
    """Read single timestamp for image in `filename`.

    Parameters
    ----------
    filename : str
    source : {'auto', 'stat', 'exif'}
        Read timestamps from file stats ('stat'), or EXIF tags ('exif').
        If 'auto', then try 'exif' first.

    Returns
    -------
    float
        timestamp, seconds since Epoch
    """
    if source == 'auto':
        try:
            return exif_timestamp(filename)
        except exceptions.ICExifReadError:
            # Not all formats carry EXIF (e.g. PNG); fall back to mtime.
            return stat_timestamp(filename)
    elif source == 'stat':
        return stat_timestamp(filename)
    elif source == 'exif':
        return exif_timestamp(filename)
    else:
        raise ValueError("source not in ['stat', 'exif', 'auto']")
# TODO some code dups below, fix later by fancy factory functions

# keras.preprocessing.image.load_img() uses img.resize(shape) with the default
# interpolation of Image.resize() which is pretty bad (see
# imagecluster/play/pil_resample_methods.py). Given that we are restricted to
# small inputs of 224x224 by the VGG network, we should do our best to keep as
# much information from the original image as possible. This is a gut feeling,
# untested. But given that model.predict() is 10x slower than PIL image loading
# and resizing .. who cares.
#
# (224, 224, 3)
##img = image.load_img(filename, target_size=size)
##... = image.img_to_array(img)
def _image_worker(filename, size):
    """Load image `filename`, convert to RGB and resize to `size`.

    Returns ``(filename, arr)`` with `arr` a 3d int array, or
    ``(filename, None)`` if PIL fails to read the file.
    """
    # Handle PIL error "OSError: broken data stream when reading image file".
    # See https://github.com/python-pillow/Pillow/issues/1510 . We have this
    # issue with smartphone panorama JPG files. But instead of bluntly setting
    # ImageFile.LOAD_TRUNCATED_IMAGES = True and hoping for the best (is the
    # image read, and till the end?), we catch the OSError thrown by PIL and
    # ignore the file completely. This is better than reading potentially
    # undefined data and process it. A more specialized exception from PILs
    # side would be good, but let's hope that an OSError doesn't cover too much
    # ground when reading data from disk :-)
    try:
        print(filename)
        # resample=3 is PIL's BICUBIC filter (see the module comment above on
        # why we don't use keras' load_img()).
        img = PIL.Image.open(filename).convert('RGB').resize(size, resample=3)
        arr = image.img_to_array(img, dtype=int)
        return filename, arr
    except OSError as ex:
        print(f"skipping {filename}: {ex}")
        return filename, None


def _timestamp_worker(filename, source):
    """Return ``(filename, timestamp)`` or ``(filename, None)`` on OSError."""
    try:
        return filename, timestamp(filename, source)
    except OSError as ex:
        print(f"skipping {filename}: {ex}")
        return filename, None


def read_images(imagedir, size, ncores=mp.cpu_count()):
    """Load images from `imagedir` and resize to `size`.

    Parameters
    ----------
    imagedir : str
    size : sequence length 2
        (width, height), used in ``Image.open(filename).resize(size)``
    ncores : int
        run that many parallel processes

    Returns
    -------
    dict
        {filename: 3d array (height, width, 3), ...}
        Unreadable files are silently left out.
    """
    _f = functools.partial(_image_worker, size=size)
    with mp.Pool(ncores) as pool:
        ret = pool.map(_f, get_files(imagedir))
    return {k: v for k, v in ret if v is not None}


def read_timestamps(imagedir, source='auto', ncores=mp.cpu_count()):
    """Read timestamps of all images in `imagedir`.

    Parameters
    ----------
    imagedir : str
    source : see :func:`~imagecluster.io.timestamp`
    ncores : int
        run that many parallel processes

    Returns
    -------
    dict
        {filename: timestamp (int, seconds since Epoch)}
        Unreadable files are silently left out.
    """
    _f = functools.partial(_timestamp_worker, source=source)
    with mp.Pool(ncores) as pool:
        ret = pool.map(_f, get_files(imagedir))
    return {k: v for k, v in ret if v is not None}


# TODO fingerprints and timestamps may have different images which have been
# skipped -> we need a data struct to hold all image data and mask out the
# skipped ones. For now we have a check in calc.cluster()
def get_image_data(imagedir, model_kwds=dict(layer='fc2'),
                   img_kwds=dict(size=(224, 224)),
                   timestamps_kwds=dict(source='auto'),
                   pca_kwds=None):
    """Convenience function to create `images`, `fingerprints`, `timestamps`.

    It checks for existing `images` and `fingerprints` database files on disk
    and load them if present. Running this again only loads data from disk,
    which is fast. Default locations::

        fingerprints: <imagedir>/imagecluster/fingerprints.pk
        images: <imagedir>/imagecluster/images.pk

    Parameters
    ----------
    imagedir : str
    model_kwds : dict
        passed to :func:`~imagecluster.calc.get_model`
    img_kwds : dict
        passed to :func:`~imagecluster.io.read_images`
    timestamps_kwds : dict
        passed to :func:`~imagecluster.io.read_timestamps`; if None, skip
        reading timestamps entirely (`timestamps` returned as None)
    pca_kwds : dict
        passed to :func:`~imagecluster.calc.pca`, PCA is skipped if
        ``pca_kwds=None``

    Returns
    -------
    images : see :func:`~imagecluster.io.read_images`
    fingerprints : see :func:`~imagecluster.calc.fingerprints`
    timestamps : see :func:`~imagecluster.io.read_timestamps`, or None
    """
    # NOTE: the dict defaults are shared across calls; they are only read,
    # never mutated, so this is safe here -- don't start mutating them.
    fingerprints_fn = pj(imagedir, ic_base_dir, 'fingerprints.pk')
    images_fn = pj(imagedir, ic_base_dir, 'images.pk')
    if os.path.exists(images_fn):
        print(f"reading image arrays {images_fn} ...")
        images = read_pk(images_fn)
    else:
        print(f"create image arrays {images_fn}")
        images = read_images(imagedir, **img_kwds)
        write_pk(images, images_fn)
    if os.path.exists(fingerprints_fn):
        print(f"reading fingerprints {fingerprints_fn} ...")
        fingerprints = read_pk(fingerprints_fn)
    else:
        print(f"create fingerprints {fingerprints_fn}")
        fingerprints = ic.fingerprints(images, ic.get_model(**model_kwds))
        if pca_kwds is not None:
            fingerprints = ic.pca(fingerprints, **pca_kwds)
        write_pk(fingerprints, fingerprints_fn)
    # Bug fix: previously `timestamps` was unbound (NameError on return) when
    # timestamps_kwds was None; now it is explicitly None in that case.
    timestamps = None
    if timestamps_kwds is not None:
        print("reading timestamps ...")
        timestamps = read_timestamps(imagedir, **timestamps_kwds)
    return images, fingerprints, timestamps