# encoding: utf-8 # Author: Vlad Niculae <vlad@vene.ro> # License: Simplified BSD import warnings from abc import ABCMeta, abstractmethod import numpy as np from sklearn.preprocessing import add_dummy_feature from sklearn.utils import check_random_state from sklearn.utils.validation import check_array from sklearn.utils.extmath import safe_sparse_dot, row_norms from sklearn.externals import six try: from sklearn.exceptions import NotFittedError except ImportError: class NotFittedError(ValueError, AttributeError): pass from lightning.impl.dataset_fast import get_dataset from .base import _BasePoly, _PolyClassifierMixin, _PolyRegressorMixin from .kernels import _poly_predict from .cd_direct_fast import _cd_direct_ho class _BaseFactorizationMachine(six.with_metaclass(ABCMeta, _BasePoly)): @abstractmethod def __init__(self, degree=2, loss='squared', n_components=2, alpha=1, beta=1, tol=1e-6, fit_lower='explicit', fit_linear=True, warm_start=False, init_lambdas='ones', max_iter=10000, verbose=False, random_state=None): self.degree = degree self.loss = loss self.n_components = n_components self.alpha = alpha self.beta = beta self.tol = tol self.fit_lower = fit_lower self.fit_linear = fit_linear self.warm_start = warm_start self.init_lambdas = init_lambdas self.max_iter = max_iter self.verbose = verbose self.random_state = random_state def _augment(self, X): # for factorization machines, we add a dummy column for each order. if self.fit_lower == 'augment': k = 2 if self.fit_linear else 1 for _ in range(self.degree - k): X = add_dummy_feature(X, value=1) return X def fit(self, X, y): """Fit factorization machine to training data. Parameters ---------- X : array-like or sparse, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : Estimator Returns self. """ if self.degree > 3: raise ValueError("FMs with degree >3 not yet supported.") X, y = self._check_X_y(X, y) X = self._augment(X) n_features = X.shape[1] # augmented X_col_norms = row_norms(X.T, squared=True) dataset = get_dataset(X, order="fortran") rng = check_random_state(self.random_state) loss_obj = self._get_loss(self.loss) if not (self.warm_start and hasattr(self, 'w_')): self.w_ = np.zeros(n_features, dtype=np.double) if self.fit_lower == 'explicit': n_orders = self.degree - 1 else: n_orders = 1 if not (self.warm_start and hasattr(self, 'P_')): self.P_ = 0.01 * rng.randn(n_orders, self.n_components, n_features) if not (self.warm_start and hasattr(self, 'lams_')): if self.init_lambdas == 'ones': self.lams_ = np.ones(self.n_components) elif self.init_lambdas == 'random_signs': self.lams_ = np.sign(rng.randn(self.n_components)) else: raise ValueError("Lambdas must be initialized as ones " "(init_lambdas='ones') or as random " "+/- 1 (init_lambdas='random_signs').") y_pred = self._get_output(X) converged, self.n_iter_ = _cd_direct_ho( self.P_, self.w_, dataset, X_col_norms, y, y_pred, self.lams_, self.degree, self.alpha, self.beta, self.fit_linear, self.fit_lower == 'explicit', loss_obj, self.max_iter, self.tol, self.verbose) if not converged: warnings.warn("Objective did not converge. Increase max_iter.") return self def _get_output(self, X): y_pred = _poly_predict(X, self.P_[0, :, :], self.lams_, kernel='anova', degree=self.degree) if self.fit_linear: y_pred += safe_sparse_dot(X, self.w_) if self.fit_lower == 'explicit' and self.degree == 3: # degree cannot currently be > 3 y_pred += _poly_predict(X, self.P_[1, :, :], self.lams_, kernel='anova', degree=2) return y_pred def _predict(self, X): if not hasattr(self, "P_"): raise NotFittedError("Estimator not fitted.") X = check_array(X, accept_sparse='csc', dtype=np.double) X = self._augment(X) return self._get_output(X) class FactorizationMachineRegressor(_BaseFactorizationMachine, _PolyRegressorMixin): """Factorization machine for regression (with squared loss). Parameters ---------- degree : int >= 2, default: 2 Degree of the polynomial. Corresponds to the order of feature interactions captured by the model. Currently only supports degrees up to 3. n_components : int, default: 2 Number of basis vectors to learn, a.k.a. the dimension of the low-rank parametrization. alpha : float, default: 1 Regularization amount for linear term (if ``fit_linear=True``). beta : float, default: 1 Regularization amount for higher-order weights. tol : float, default: 1e-6 Tolerance for the stopping condition. fit_lower : {'explicit'|'augment'|None}, default: 'explicit' Whether and how to fit lower-order, non-homogeneous terms. - 'explicit': fits a separate P directly for each lower order. - 'augment': adds the required number of dummy columns (columns that are 1 everywhere) in order to capture lower-order terms. Adds ``degree - 2`` columns if ``fit_linear`` is true, or ``degree - 1`` columns otherwise, to account for the linear term. - None: only learns weights for the degree given. If ``degree == 3``, for example, the model will only have weights for third-order feature interactions. fit_linear : {True|False}, default: True Whether to fit an explicit linear term <w, x> to the model, using coordinate descent. If False, the model can still capture linear effects if ``fit_lower == 'augment'``. warm_start : boolean, optional, default: False Whether to use the existing solution, if available. Useful for computing regularization paths or pre-initializing the model. init_lambdas : {'ones'|'random_signs'}, default: 'ones' How to initialize the predictive weights of each learned basis. The lambdas are not trained; using alternate signs can theoretically improve performance if the kernel degree is even. The default value of 'ones' matches the original formulation of factorization machines (Rendle, 2010). To use custom values for the lambdas, ``warm_start`` may be used. max_iter : int, optional, default: 10000 Maximum number of passes over the dataset to perform. verbose : boolean, optional, default: False Whether to print debugging information. random_state : int seed, RandomState instance, or None (default) The seed of the pseudo random number generator to use for initializing the parameters. Attributes ---------- self.P_ : array, shape [n_orders, n_components, n_features] The learned basis functions. ``self.P_[0, :, :]`` is always available, and corresponds to interactions of order ``self.degree``. ``self.P_[i, :, :]`` for i > 0 corresponds to interactions of order ``self.degree - i``, available only if ``self.fit_lower='explicit'``. self.w_ : array, shape [n_features] The learned linear model, completing the FM. Only present if ``self.fit_linear`` is true. self.lams_ : array, shape [n_components] The predictive weights. References ---------- Polynomial Networks and Factorization Machines: New Insights and Efficient Training Algorithms. Mathieu Blondel, Masakazu Ishihata, Akinori Fujino, Naonori Ueda. In: Proceedings of ICML 2016. http://mblondel.org/publications/mblondel-icml2016.pdf Factorization machines. Steffen Rendle In: Proceedings of IEEE 2010. """ def __init__(self, degree=2, n_components=2, alpha=1, beta=1, tol=1e-6, fit_lower='explicit', fit_linear=True, warm_start=False, init_lambdas='ones', max_iter=10000, verbose=False, random_state=None): super(FactorizationMachineRegressor, self).__init__( degree, 'squared', n_components, alpha, beta, tol, fit_lower, fit_linear, warm_start, init_lambdas, max_iter, verbose, random_state) class FactorizationMachineClassifier(_BaseFactorizationMachine, _PolyClassifierMixin): """Factorization machine for classification. Parameters ---------- degree : int >= 2, default: 2 Degree of the polynomial. Corresponds to the order of feature interactions captured by the model. Currently only supports degrees up to 3. loss : {'logistic'|'squared_hinge'|'squared'}, default: 'squared_hinge' Which loss function to use. - logistic: L(y, p) = log(1 + exp(-yp)) - squared hinge: L(y, p) = max(1 - yp, 0)² - squared: L(y, p) = 0.5 * (y - p)² n_components : int, default: 2 Number of basis vectors to learn, a.k.a. the dimension of the low-rank parametrization. alpha : float, default: 1 Regularization amount for linear term (if ``fit_linear=True``). beta : float, default: 1 Regularization amount for higher-order weights. tol : float, default: 1e-6 Tolerance for the stopping condition. fit_lower : {'explicit'|'augment'|None}, default: 'explicit' Whether and how to fit lower-order, non-homogeneous terms. - 'explicit': fits a separate P directly for each lower order. - 'augment': adds the required number of dummy columns (columns that are 1 everywhere) in order to capture lower-order terms. Adds ``degree - 2`` columns if ``fit_linear`` is true, or ``degree - 1`` columns otherwise, to account for the linear term. - None: only learns weights for the degree given. If ``degree == 3``, for example, the model will only have weights for third-order feature interactions. fit_linear : {True|False}, default: True Whether to fit an explicit linear term <w, x> to the model, using coordinate descent. If False, the model can still capture linear effects if ``fit_lower == 'augment'``. warm_start : boolean, optional, default: False Whether to use the existing solution, if available. Useful for computing regularization paths or pre-initializing the model. init_lambdas : {'ones'|'random_signs'}, default: 'ones' How to initialize the predictive weights of each learned basis. The lambdas are not trained; using alternate signs can theoretically improve performance if the kernel degree is even. The default value of 'ones' matches the original formulation of factorization machines (Rendle, 2010). To use custom values for the lambdas, ``warm_start`` may be used. max_iter : int, optional, default: 10000 Maximum number of passes over the dataset to perform. verbose : boolean, optional, default: False Whether to print debugging information. random_state : int seed, RandomState instance, or None (default) The seed of the pseudo random number generator to use for initializing the parameters. Attributes ---------- self.P_ : array, shape [n_orders, n_components, n_features] The learned basis functions. ``self.P_[0, :, :]`` is always available, and corresponds to interactions of order ``self.degree``. ``self.P_[i, :, :]`` for i > 0 corresponds to interactions of order ``self.degree - i``, available only if ``self.fit_lower='explicit'``. self.w_ : array, shape [n_features] The learned linear model, completing the FM. Only present if ``self.fit_linear`` is true. self.lams_ : array, shape [n_components] The predictive weights. References ---------- Polynomial Networks and Factorization Machines: New Insights and Efficient Training Algorithms. Mathieu Blondel, Masakazu Ishihata, Akinori Fujino, Naonori Ueda. In: Proceedings of ICML 2016. http://mblondel.org/publications/mblondel-icml2016.pdf Factorization machines. Steffen Rendle In: Proceedings of IEEE 2010. """ def __init__(self, degree=2, loss='squared_hinge', n_components=2, alpha=1, beta=1, tol=1e-6, fit_lower='explicit', fit_linear=True, warm_start=False, init_lambdas='ones', max_iter=10000, verbose=False, random_state=None): super(FactorizationMachineClassifier, self).__init__( degree, loss, n_components, alpha, beta, tol, fit_lower, fit_linear, warm_start, init_lambdas, max_iter, verbose, random_state)