python source code of sketchfab

import h5py
import time
import os
import logging
from scipy.sparse import coo_matrix, csr_matrix
import numpy as np

from implicit.datasets import _download


log = logging.getLogger("implicit")


URL = 'https://github.com/benfred/recommender_data/releases/download/v1.0/sketchfab.hdf5'


def get_sketchfab():
    """ Returns the sketchfab dataset, downloading locally if necessary.

    This dataset contains about 632K likes from 62K users on 28k items collected
    from the sketchfab website, as described here:
    http://blog.ethanrosenthal.com/2016/10/09/likes-out-guerilla-dataset/

    Returns a tuple of (items, users, likes) where likes is a CSR matrix """

    filename = os.path.join(_download.LOCAL_CACHE_DIR, "sketchfab.hdf5")
    if not os.path.isfile(filename):
        log.info("Downloading dataset to '%s'", filename)
        _download.download_file(URL, filename)
    else:
        log.info("Using cached dataset at '%s'", filename)

    with h5py.File(filename, 'r') as f:
        m = f.get('item_user_likes')
        plays = csr_matrix((m.get('data'), m.get('indices'), m.get('indptr')))
        return np.array(f['item']), np.array(f['user']), plays


def generate_dataset(filename, outputfilename):
    """ Generates a hdf5 lastfm datasetfile from the raw datafiles found at:
    http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html

    You shouldn't have to run this yourself, and can instead just download the
    output using the 'get_lastfm' funciton./

    Note there are some invalid entries in this dataset, running
    this function will clean it up so pandas can read it:
    https://github.com/benfred/bens-blog-code/blob/master/distance-metrics/musicdata.py#L39
    """
    data = _read_dataframe(filename)
    _hfd5_from_dataframe(data, outputfilename)


def _read_dataframe(filename):
    """ Reads the original dataset PSV as a pandas dataframe """
    import pandas

    # read in triples of user/artist/playcount from the input dataset
    # get a model based off the input params
    start = time.time()
    log.debug("reading data from %s", filename)
    data = pandas.read_csv(filename, delimiter='|', quotechar='\\')

    # map each artist and user to a unique numeric value
    data['uid'] = data['uid'].astype("category")
    data['mid'] = data['mid'].astype("category")

    # store as a CSR matrix
    log.debug("read data file in %s", time.time() - start)
    return data


def _hfd5_from_dataframe(data, outputfilename):
    items = data['mid'].cat.codes.copy()
    users = data['uid'].cat.codes.copy()
    values = np.ones(len(items)).astype(np.float32)

    # create a sparse matrix of all the item/users/likes
    likes = coo_matrix((values, (items, users))).astype(np.float32).tocsr()

    with h5py.File(outputfilename, "w") as f:
        g = f.create_group('item_user_likes')
        g.create_dataset("data", data=likes.data)
        g.create_dataset("indptr", data=likes.indptr)
        g.create_dataset("indices", data=likes.indices)

        dt = h5py.special_dtype(vlen=str)
        item = list(data['mid'].cat.categories)
        dset = f.create_dataset('item', (len(item),), dtype=dt)
        dset[:] = item

        user = list(data['uid'].cat.categories)
        dset = f.create_dataset('user', (len(user),), dtype=dt)
        dset[:] = user