from vergeml.img import INPUT_PATTERNS, open_image, fixext, ImageType
from import source, SourcePlugin, Sample
from import Labels
from vergeml.utils import VergeMLError, xlink
from vergeml.option import option
import random
import numpy as np
from PIL import Image
import os.path
import json
from operator import methodcaller
import io
from copy import deepcopy

@source('labeled-image', descr="Load labeled images.")
@option('oversample', descr="Oversamples labels.", type=dict, yaml_only=True, default={})
class LabeledImageSource(SourcePlugin):
    """Source plugin that yields images together with their labels.

    Labels are read from ``labels.txt`` and ``classes.json`` inside
    ``samples_dir`` when ``classes.json`` exists; otherwise they are derived
    from the names of the sub-directories of the sample directory.
    """

    input_patterns = INPUT_PATTERNS

    # Dict with a 'files' mapping (file path -> list of label names) and an
    # optional 'split' mapping; filled in by begin_read_samples().
    classes = None

    def __init__(self, config: dict = None):
        """Remember the oversampling configuration.

        :param config: plugin options; only ``oversample`` is read here.
        """
        # NOTE(review): SourcePlugin.__init__ is not invoked here, yet the
        # class relies on inherited attributes (samples_dir, meta, ...) --
        # confirm the base class needs no initialisation of its own.
        config = {} if config is None else config

        # split name -> list of (filename, meta), built lazily
        self.files = None
        self.oversample = deepcopy(config.get('oversample', dict()))

    def begin_read_samples(self):
        """Load the label definitions and build the per-split file lists.

        Returns immediately when the file lists were already built.

        :raises VergeMLError: when no labels could be found.
        """
        if self.files:
            return

        classes_path = os.path.join(self.samples_dir, "classes.json")
        classes_are_directories = not os.path.exists(classes_path)

        if classes_are_directories:
            self._get_classes_from_dirs()
        else:
            self._get_classes_from_json()

        if not self.meta['labels']:
            raise VergeMLError("No labels found.")

        self.files = self.scan_and_split_files(self._scan_dirs(classes_are_directories))

        if self.oversample:
            self.files = self._oversample_files(self.files)

    def _oversample_files(self, files):
        """Return a new split -> entries dict with oversampled entries repeated."""
        result = {}

        for split, entries in files.items():
            if split == 'test':
                # don't augment test samples
                result[split] = entries
                continue

            repeated = []
            for filename, meta in entries:
                labels = self.classes["files"][filename]
                repeated.append((filename, meta))
                for label, factor in self.oversample.items():
                    if label in labels:
                        # the entry is already present once, so add factor - 1 copies
                        repeated.extend([(filename, meta)] * (factor - 1))
            result[split] = repeated

        return result

    def num_samples(self, split: str) -> int:
        """Return the number of entries (including oversampled ones) in *split*."""
        return len(self.files[split])

    def _get_classes_from_json(self):
        """Read ``labels.txt`` and ``classes.json`` from ``samples_dir``.

        :raises VergeMLError: when one of the two files is missing.
        """
        for filename in ("labels.txt", "classes.json"):
            path = os.path.join(self.samples_dir, filename)
            if not os.path.exists(path):
                raise VergeMLError("{} is missing".format(filename))

            with open(path) as f:
                if filename == "labels.txt":
                    # one label per line; blank lines are dropped
                    items = list(filter(None, map(methodcaller("strip"), f)))
                    labels = Labels(items)
                else:
                    self.classes = json.load(f)

        # prefix the sample with input_dir
        files = {}
        for relname, file_labels in self.classes['files'].items():
            # on windows and linux, separator is /
            fname = os.path.join(self.samples_dir, *relname.split("/"))
            files[fname] = file_labels

        self.classes['files'] = files
        self.meta['labels'] = labels

    def _get_classes_from_dirs(self):
        """Derive labels and file assignments from the directory layout.

        Every non-hidden sub-directory of the first configured directory
        names a label; each file found below a directory of that name in
        any of the configured directories is assigned that single label.
        """
        dirs = [d for d in (self.samples_dir, self.val_dir, self.test_dir) if d]

        # get label names from directories
        base_dir = dirs[0]
        names = [
            entry for entry in os.listdir(base_dir)
            if os.path.isdir(os.path.join(base_dir, entry))
            and not entry.startswith(".")
        ]
        labels = Labels(sorted(names))

        self.classes = dict(files=dict())
        for label in labels:
            for split_dir in dirs:
                label_dir = os.path.join(split_dir, label)
                if not os.path.exists(label_dir):
                    continue
                for root, _, filenames in os.walk(label_dir):
                    for fname in filenames:
                        self.classes["files"][os.path.join(root, fname)] = [label]
        self.meta['labels'] = labels

    def _scan_dirs(self, classes_are_directories):
        """Return the ``(train, val, test)`` file lists.

        :param classes_are_directories: True when labels were derived from
            directory names, in which case the superclass scans the
            directories; False when files are listed in classes.json.
        """
        if not classes_are_directories:
            # we only want to return the files from classes.json
            res = dict(train=[], val=[], test=[])

            # classes.json might have a section where splits are set on a per
            # file basis. Use this to decide the split when it exists
            # NOTE(review): the keys of self.classes['files'] have been
            # prefixed with samples_dir at this point; confirm the 'split'
            # section uses the same form, otherwise no lookup will match.
            splits = self.classes.get('split', dict())
            for file in self.classes['files']:
                res[splits.get(file, 'train')].append(file)

            # In case splits were determined by classes.json, turn off
            # automatic split
            if res['val'] or res['test']:
                # turn off by setting everything to None
                for k in ('val_num', 'val_perc', 'val_dir', 'test_num', 'test_perc', 'test_dir'):
                    setattr(self, k, None)
        else:
            # otherwise, the superclass implementation will handle scanning
            # samples_dir
            train, val, test = super().scan_dirs()
            res = dict(train=train, val=val, test=test)

        return res['train'], res['val'], res['test']

    def read_samples(self, split, index, n=1):
        """Return the samples of *split* in the slice ``index:index+n``."""
        res = []
        for filename, meta in self.files[split][index:index + n]:
            # seed from random_seed and the file name so each sample gets
            # its own reproducible random generator
            rng = random.Random(str(self.random_seed) + meta['filename'])
            y = Labels(self.classes["files"][filename])
            res.append(Sample(open_image(filename), y, meta.copy(), rng))

        return res

    def hash(self, state) -> str:
        """Hash *state* combined with the hash of the scanned files."""
        return super().hash(state + self.hash_files(self.files))

    def transform(self, sample):
        """Turn the image into an array and the labels into a 0/1 vector.

        The vector has one float per entry of ``self.meta['labels']``, set
        to 1.0 when that label is among the sample's labels.
        """
        onehot = np.array([float(label in sample.y) for label in self.meta['labels']])
        sample.x = np.asarray(sample.x)
        sample.y = onehot
        return sample

    def begin_preview(self, output_dir):
        """Create the ``.data`` directory below *output_dir*."""
        # generate data dir
        os.makedirs(os.path.join(output_dir, ".data"), exist_ok=True)

    def supports_preview(self):
        """Report that this source can write previews."""
        return True

    def write_preview(self, output_dir: str, split: str, sample: Sample):
        """Write *sample* to ``.data`` and link it from ``<split>/<label>``.

        :raises VergeMLError: when ``sample.x`` is not an image or
            ``sample.y`` is not a ``Labels`` instance.
        """
        # make sure x and y have the right types
        if not isinstance(sample.x, ImageType):
            raise VergeMLError("Can't write sample with type: {}".format(type(sample.x)))

        if not isinstance(sample.y, Labels):
            raise VergeMLError("Can't write ground truth with type: {}".format(type(sample.y)))

        # get the right filename in .data to write the sample to
        data_dir = os.path.join(output_dir, ".data")
        name = fixext(os.path.basename(sample.meta['filename']), sample.x)
        path = self.preview_filename(os.path.join(data_dir, name))

        # write the image so the links created below have a target
        # NOTE(review): assumes ImageType is a PIL image exposing .save() -- confirm
        sample.x.save(path)

        # create directories and hyperlinks so that split and label are visible in a file
        # manager
        for label in sample.y:
            link_dir = os.path.join(output_dir, split, label)
            os.makedirs(link_dir, exist_ok=True)
            link_path = self.preview_filename(os.path.join(link_dir, name))
            xlink(os.path.abspath(path), link_path)