# Copyright (C) 2019 Maxim Godzi, Anatoly Zaytsev, Dmitrii Kiselev
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.


from collections import Counter

import pandas as pd
import umap.umap_ as umap
from sklearn import decomposition
from sklearn import manifold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE


def _uni_counts_embedder(data, **kwargs):
    if 'index_col' not in kwargs:
        index_col = data.trajectory.retention_config['index_col']
    else:
        index_col = kwargs['index_col']
    if 'event_col' not in kwargs:
        event_col = data.trajectory.retention_config['event_col']
    else:
        event_col = kwargs['event_col']
    last_k = kwargs.get('last_k')
    if last_k is not None:
        data = data.groupby(index_col).tail(last_k)
    cv = data.groupby([index_col, event_col]).size().rename('event_count').reset_index()
    cv = cv.pivot(index=index_col, columns=event_col).fillna(0)
    cv.columns = cv.columns.levels[1]
    cv.columns.set_names(None, inplace=True)
    cv.index.set_names(None, inplace=True)
    setattr(cv.retention, 'datatype', 'features')
    return cv


def _ngram_agg(x, ngram_range):
    # Build n-grams by zipping the event column with its shifted copies:
    # the i-th shift holds the event that happened i steps earlier, so each
    # tuple reads backwards in time. Early rows are padded with NaN.
    res = []
    shifts = []
    for i in range(ngram_range[0] - 1, ngram_range[1]):
        shifts.append(x.shift(i))
        res.extend(zip(*shifts))
    return Counter(res)
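

# Illustrative sketch (comments only, not part of the library API): what
# ``_ngram_agg`` returns for a tiny, hypothetical event series. Incomplete
# n-grams at the start of a trajectory carry NaN padding, which
# ``counts_embedder`` filters out below.
#
#     s = pd.Series(['main', 'cart', 'main'])
#     _ngram_agg(s, (1, 2))
#     # Counter({('main',): 2, ('cart',): 1, ('main', nan): 1,
#     #          ('cart', 'main'): 1, ('main', 'cart'): 1})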


def counts_embedder(data, ngram_range=(1, 1), **kwargs):
    """
    Calculates the ``index_col`` embedding (continuous vector form) by counting
    ``event_col`` appearances for each ``index_col``.

    Parameters
    ----------
    data: pd.DataFrame
        Clickstream dataset.
    ngram_range: tuple, optional
        Range of n-grams to use in feature extraction. Default: ``(1, 1)``
    index_col: str, optional
        Name of custom index column, for more information refer to ``init_config``.
        Useful, for instance, if ``index_col`` is defined as ``user_id`` in config,
        but you want to apply the function over sessions. By default, the column
        defined in ``init_config`` is used as ``index_col``.
    event_col: str, optional
        Name of custom event column, for more information refer to ``init_config``.
        Useful, for instance, if you want to aggregate or rename some events and
        use the result as a new event column. By default, the column defined in
        ``init_config`` is used as ``event_col``.
    last_k: int, optional
        Include only the last ``last_k`` events for each ``index_col``.
    wo_last_k: int, optional
        Exclude the last ``wo_last_k`` events for each ``index_col``.

    Returns
    -------
    Vectorized dataframe with ``index_col`` as index and counts of ``event_col``
    values as dataframe values.

    Return type
    -----------
    pd.DataFrame
    """
    if max(ngram_range) == 1:
        return _uni_counts_embedder(data, **kwargs)
    if 'index_col' not in kwargs:
        index_col = data.trajectory.retention_config['index_col']
    else:
        index_col = kwargs['index_col']
    if 'event_col' not in kwargs:
        event_col = data.trajectory.retention_config['event_col']
    else:
        event_col = kwargs['event_col']
    last_k = kwargs.get('last_k')
    if last_k is not None:
        data = data.groupby(index_col).tail(last_k)
    wo_last = kwargs.get('wo_last_k')
    if wo_last is not None:
        bad_ids = data.groupby(index_col).tail(wo_last).index.values
        data = data[~data.index.isin(bad_ids)]
    cv = data.groupby(index_col)[event_col].apply(_ngram_agg, ngram_range=ngram_range).reset_index()
    cv = cv.pivot(index=index_col, columns='level_1', values=event_col).fillna(0)
    # NaN != NaN, so this check keeps only complete n-grams (drops the
    # NaN-padded ones produced by ``shift`` in ``_ngram_agg``).
    cv = cv.loc[:, [i for i in cv.columns if i[-1] == i[-1]]]
    cv.columns.set_names(None, inplace=True)
    cv.index.set_names(None, inplace=True)
    return cv


def frequency_embedder(data, ngram_range=(1, 1), **kwargs):
    """
    Similar to ``counts_embedder()``, but normalizes event counts over ``index_col``.

    Parameters
    ----------
    data: pd.DataFrame
        Clickstream dataset.
    ngram_range: tuple, optional
        Range of n-grams to use in feature extraction. Default: ``(1, 1)``
    index_col: str, optional
        Name of custom index column, for more information refer to ``init_config``.
        Useful, for instance, if ``index_col`` is defined as ``user_id`` in config,
        but you want to apply the function over sessions. By default, the column
        defined in ``init_config`` is used as ``index_col``.
    event_col: str, optional
        Name of custom event column, for more information refer to ``init_config``.
        Useful, for instance, if you want to aggregate or rename some events and
        use the result as a new event column. By default, the column defined in
        ``init_config`` is used as ``event_col``.

    Returns
    -------
    Dataframe with ``index_col`` vectorized by frequencies of events.

    Return type
    -----------
    pd.DataFrame
    """
    cv = counts_embedder(data, ngram_range, **kwargs)
    freq = pd.DataFrame(
        cv.values / cv.values.sum(1).reshape(-1, 1),
        index=cv.index.values,
        columns=cv.columns.values,
    )
    setattr(freq.retention, 'datatype', 'features')
    return freq
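

# Usage sketch (hypothetical column names; assumes the ``retention`` accessor
# has been registered via retentioneering's ``init_config``, since the
# embedders tag their output with ``retention.datatype``):
#
#     df = pd.DataFrame({'user_id': [1, 1, 1, 2, 2],
#                        'event': ['main', 'cart', 'main', 'main', 'lost']})
#     counts = counts_embedder(df, index_col='user_id', event_col='event')
#     freqs = frequency_embedder(df, index_col='user_id', event_col='event')
#     # every row of ``freqs`` sums to 1: counts are normalized per user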


def tfidf_embedder(data, ngram_range=(1, 1), **kwargs):
    """
    Similar to ``frequency_embedder()``, but normalizes event frequencies with
    inverse document frequency.

    Parameters
    ----------
    data: pd.DataFrame
        Clickstream dataset.
    ngram_range: tuple, optional
        Range of n-grams to use in feature extraction. Default: ``(1, 1)``
    index_col: str, optional
        Name of custom index column, for more information refer to ``init_config``.
        Useful, for instance, if ``index_col`` is defined as ``user_id`` in config,
        but you want to apply the function over sessions. By default, the column
        defined in ``init_config`` is used as ``index_col``.
    event_col: str, optional
        Name of custom event column, for more information refer to ``init_config``.
        Useful, for instance, if you want to aggregate or rename some events and
        use the result as a new event column. By default, the column defined in
        ``init_config`` is used as ``event_col``.

    Returns
    -------
    Dataframe with ``index_col`` vectorized by TF-IDF of events.

    Return type
    -----------
    pd.DataFrame
    """
    if 'index_col' not in kwargs:
        index_col = data.trajectory.retention_config['index_col']
    else:
        index_col = kwargs['index_col']
    if 'event_col' not in kwargs:
        event_col = data.trajectory.retention_config['event_col']
    else:
        event_col = kwargs['event_col']
    corpus = data.groupby(index_col)[event_col].apply(
        lambda x: '~~'.join([el.lower() for el in x])
    )
    # Use the groupby order for the index so rows stay aligned with the
    # vectorized corpus.
    if kwargs.get('vocab') is not None:
        vectorizer = TfidfVectorizer(vocabulary=kwargs['vocab'],
                                     token_pattern='[^~]+',
                                     ngram_range=ngram_range)
        tfidf = pd.DataFrame(index=corpus.index,
                             columns=kwargs['vocab'].keys(),
                             data=vectorizer.fit_transform(corpus).todense())
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, token_pattern='[^~]+').fit(corpus)
        cols = [dict_key[0] for dict_key in sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])]
        tfidf = pd.DataFrame(index=corpus.index,
                             columns=cols,
                             data=vectorizer.transform(corpus).todense())
    setattr(tfidf.retention, 'datatype', 'features')
    return tfidf


def learn_tsne(data, **kwargs):
    """
    Calculates the t-SNE transformation for a given feature matrix.

    Parameters
    ----------
    data: pd.DataFrame
        Dataframe of features indexed by ``index_col`` values.
    kwargs: optional
        Parameters for ``sklearn.manifold.TSNE()``.

    Returns
    -------
    Calculated t-SNE transform.

    Return type
    -----------
    pd.DataFrame
    """
    # Keep only the keyword arguments that TSNE actually accepts; default
    # random_state to 0 unless the caller overrides it.
    _tsne_filter = TSNE.get_params(TSNE)
    kwargs = {i: j for i, j in kwargs.items() if i in _tsne_filter}
    kwargs.setdefault('random_state', 0)
    res = TSNE(**kwargs).fit_transform(data.values)
    return pd.DataFrame(res, index=data.index.values)


def learn_umap(data, **kwargs):
    """
    Calculates the UMAP transformation for a given feature matrix.

    Parameters
    ----------
    data: pd.DataFrame
        Dataframe of features indexed by ``index_col`` values.
    kwargs: optional
        Parameters for ``umap.UMAP()``.

    Returns
    -------
    Calculated UMAP transform.

    Return type
    -----------
    pd.DataFrame
    """
    # Keep only the keyword arguments that UMAP actually accepts; default
    # min_dist to 1 and random_state to 0 unless the caller overrides them
    # (passing them both ways would raise a duplicate-keyword error).
    _umap_filter = umap.UMAP().get_params()
    kwargs = {i: j for i, j in kwargs.items() if i in _umap_filter}
    kwargs.setdefault('min_dist', 1)
    kwargs.setdefault('random_state', 0)
    embedding = umap.UMAP(**kwargs).fit_transform(data.values)
    return pd.DataFrame(embedding, index=data.index.values)


def get_manifold(data, manifold_type, **kwargs):
    """
    Reduces the number of dimensions.

    Parameters
    ----------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``.
    manifold_type: str
        Name of a dimensionality reduction method from ``sklearn.decomposition``
        or ``sklearn.manifold``.
    kwargs: optional
        Parameters for ``sklearn.decomposition`` and ``sklearn.manifold`` methods.

    Returns
    -------
    Dataframe with reduced dimensions.

    Return type
    -----------
    pd.DataFrame
    """
    if hasattr(decomposition, manifold_type):
        man = getattr(decomposition, manifold_type)
    elif hasattr(manifold, manifold_type):
        man = getattr(manifold, manifold_type)
    else:
        raise ValueError(f'There is no such manifold: {manifold_type}')
    tsvd = man(**{i: j for i, j in kwargs.items() if i in man.get_params(man)})
    res = tsvd.fit_transform(data)
    return pd.DataFrame(res, index=data.index)
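

# Usage sketch (hypothetical parameters, given a clickstream ``df`` as above):
# TF-IDF features projected to two dimensions with UMAP or t-SNE, or reduced
# with any sklearn method referenced by name.
#
#     features = tfidf_embedder(df, ngram_range=(1, 2),
#                               index_col='user_id', event_col='event')
#     umap_2d = learn_umap(features, n_neighbors=15, n_components=2)
#     tsne_2d = learn_tsne(features, perplexity=5)
#     pca_5d = get_manifold(features, 'PCA', n_components=5)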


def merge_features(features, metadata, meta_index_col=None, manifold_type=None, fillna=None, drop=False, **kwargs):
    """
    Adds metadata to the feature matrix of trajectories; the features are
    reduced first if ``manifold_type`` is not ``None``.

    Parameters
    ----------
    features: pd.DataFrame
        Dataframe with trajectory features, indexed by ``index_col`` values.
    metadata: pd.DataFrame
        Dataframe with user or session properties or any other information you
        would like to extract as features (e.g. user properties, LTV values, etc.).
        Default: ``None``
    meta_index_col: str, optional
        Used when metadata is not ``None``. Name of the column in the ``metadata``
        dataframe that contains the same ID as in ``index_col``, or, if not defined,
        the same as in retention_config (e.g. ID of users or sessions). If ``None``,
        then the index of the metadata dataframe is used instead. Default: ``None``
    manifold_type: str, optional
        Name of a dimensionality reduction method from ``sklearn.decomposition``
        or ``sklearn.manifold``. Default: ``None``
    fillna: optional
        Value for filling missing metadata for any ``index_col`` value.
        Default: ``None``
    drop: bool, optional
        If ``True``, then drops users which do not exist in the ``metadata``
        dataframe. Default: ``False``
    kwargs: optional
        Keyword arguments for ``sklearn.decomposition`` and ``sklearn.manifold``
        methods.

    Returns
    -------
    Dataframe with trajectory features (possibly reduced) and users metadata.

    Return type
    -----------
    pd.DataFrame
    """
    if manifold_type is not None:
        features = get_manifold(features, manifold_type, **kwargs)
    if meta_index_col is not None:
        metadata.index = metadata[meta_index_col].values
        metadata = metadata.drop(columns=meta_index_col)
    res = features.join(metadata, rsuffix='_meta')
    if drop and (fillna is None):
        res = res[res.isnull().sum(1) == 0].copy()
    if fillna is not None:
        res = res.fillna(fillna)
    return res


def drop_equal_features(features, users, thres=0.1, **kwargs):
    """
    Drops nonzero features that occur with near-equal counts among positive
    and negative users.

    Parameters
    ----------
    features: pd.DataFrame
        Dataframe of features.
    users: list or np.ndarray
        List of positive users (e.g. ``data.retention.get_positive_users()``).
    thres: float, optional
        Threshold for dropping: e.g. ``0.1`` drops features whose
        positive-to-negative count ratio falls within ``[0.9, 1.1]``.
        Default: ``0.1``

    Returns
    -------
    Dataframe of features with the near-equal columns dropped.

    Return type
    -----------
    pd.DataFrame
    """
    # For negative (False) and positive (True) users, count in how many rows
    # each feature is nonzero.
    feat_group = features.groupby(features.index.isin(users)).agg(lambda x: (x > 0).sum())
    feat_neg = feat_group.iloc[0]
    feat_pos = feat_group.iloc[1]
    feat_neg += 10 ** -6  # avoid division by zero
    feat_div = feat_pos / feat_neg - 1.
    features_drop = features.drop(features.columns[feat_div.abs() < thres], axis=1)
    return features_drop
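

# Usage sketch (hypothetical frames): attach per-user metadata to a feature
# matrix and prune features that do not separate positive from negative users.
#
#     meta = pd.DataFrame({'user_id': [1, 2], 'ltv': [10.0, 0.0]})
#     merged = merge_features(features, meta, meta_index_col='user_id', fillna=0)
#     positive_users = [1]  # in practice: data.retention.get_positive_users()
#     pruned = drop_equal_features(features, positive_users, thres=0.1)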