# License: BSD 3 clause
import os
import tarfile

import numpy as np
import scipy
from sklearn.datasets import load_svmlight_file

from tick.dataset.download_helper import download_dataset, get_data_home

dataset_path = 'url/url_svmlight.tar.gz'
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/url/url_svmlight.tar.gz'
_N_FEATURES = 3231961


def load_url_dataset_day(cache_path, days):
    """Loads url dataset from a tar file

    Parameters
    ----------
    cache_path : `str`
        Path to the tar file

    days : `list` or `range`
        Days to be loaded

    Returns
    -------
    X : `np.ndarray`
        A sparse matrix containing the features

    y : `np.ndarray`
        An array containing the labels
    """
    tar_file = tarfile.open(cache_path, "r:gz")

    X, y = None, None

    for day in days:
        data_filename = 'url_svmlight/Day{}.svm'.format(day)
        with tar_file.extractfile(data_filename) as data_file:
            X_day, y_day = load_svmlight_file(data_file,
                                              n_features=_N_FEATURES)

        if X is None:
            X, y = X_day, y_day
        else:
            X = scipy.sparse.vstack((X, X_day))
            y = np.hstack((y, y_day))

    return X, y


def download_url_dataset(data_home=None, verbose=False):
    """Downloads URL dataset and stores it locally

    Parameters
    ----------
    data_home : `str`, optional, default=None
        Specify a download and cache folder for the datasets. If None
        and not configured with TICK_DATASETS environement variable
        all tick datasets are stored in '~/tick_datasets' subfolders.

    verbose : `bool`, default=True
        If True, download progress bar will be printed

    Returns
    -------
    cache_path : `str`
        File path of the downloaded data
    """
    return download_dataset(dataset_url, dataset_path, data_home=data_home,
                            verbose=verbose)


def fetch_url_dataset(n_days=120, data_home=None, verbose=True):
    """Loads URL dataset

    Uses cache if this dataset has already been downloaded.

    Parameters
    ----------
    data_home : `str`, optional, default=None
        Specify a download and cache folder for the datasets. If None
        and not configured with TICK_DATASETS environement variable
        all tick datasets are stored in '~/tick_datasets' subfolders.

    verbose : `bool`, default=True
        If True, download progress bar will be printed

    Returns
    -------
    X : `np.ndarray`
        A sparse matrix containing the features

    y : `np.ndarray`
        An array containing the labels
    """
    data_home = get_data_home(data_home)
    cache_path = os.path.join(data_home, dataset_path)

    dataset = None
    if os.path.exists(cache_path):
        try:
            dataset = load_url_dataset_day(cache_path, range(n_days))
        except Exception as e:
            print(80 * '_')
            print('Cache loading failed')
            print(80 * '_')
            print(e)

    if dataset is None:
        download_url_dataset(data_home=data_home, verbose=verbose)
        dataset = load_url_dataset_day(cache_path, range(n_days))

    return dataset