Python scipy.sparse.spdiags() Examples

The following are 30 code examples of scipy.sparse.spdiags(), drawn from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scipy.sparse, or try the search function.
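Before the examples, here is a minimal self-contained sketch (not taken from any of the projects below) of what scipy.sparse.spdiags() does: it builds a sparse matrix from the rows of a data array, placing row k on the diagonal given by offsets[k]. Element j of each row lands in column j of the matrix, and entries that fall outside the matrix are silently dropped.

import numpy as np
from scipy import sparse

data = np.array([[1, 2, 3, 4],      # goes on the sub-diagonal (offset -1)
                 [5, 6, 7, 8],      # goes on the main diagonal (offset 0)
                 [9, 10, 11, 12]])  # goes on the super-diagonal (offset 1)
offsets = np.array([-1, 0, 1])
A = sparse.spdiags(data, offsets, 4, 4)
print(A.toarray())
# [[ 5 10  0  0]
#  [ 1  6 11  0]
#  [ 0  2  7 12]
#  [ 0  0  3  8]]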
Example #1
Source File: keyword_based.py    From conversational-datasets with Apache License 2.0    6 votes
def train(self, contexts, responses):
        """Fit the tf-idf transform and compute idf statistics."""
        with ignore_warnings():
            # Ignore deprecated `non_negative` warning.
            self._vectorizer = HashingVectorizer(non_negative=True)
        self._tfidf_transform = TfidfTransformer()
        count_matrix = self._tfidf_transform.fit_transform(
            self._vectorizer.transform(contexts + responses))
        n_samples, n_features = count_matrix.shape
        df = _document_frequency(count_matrix)
        idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        self._idf_diag = sp.spdiags(
            idf, diags=0, m=n_features, n=n_features
        )
        document_lengths = count_matrix.sum(axis=1)
        self._average_document_length = np.mean(document_lengths)
        print(self._average_document_length) 
Example #2
Source File: xc_metrics.py    From pyxclib with MIT License    6 votes
def _setup_metric(X, true_labels, inv_psp=None, k=5):
    assert compatible_shapes(X, true_labels), \
        "ground truth and prediction matrices must have same shape."
    num_instances, num_labels = true_labels.shape
    indices = _get_topk(X, num_labels, k)
    ps_indices = None
    if inv_psp is not None:
        ps_indices = _get_topk(
            true_labels.dot(
                sp.spdiags(inv_psp, diags=0,
                           m=num_labels, n=num_labels)),
            num_labels, k)
        inv_psp = np.hstack([inv_psp, np.zeros((1))])

    true_labels = sp.hstack([true_labels,
                             sp.lil_matrix((num_instances, 1),
                                           dtype=np.int32)]).tocsr()
    return indices, true_labels, ps_indices, inv_psp 
Example #3
Source File: large_scale.py    From Computable with MIT License    6 votes
def sakurai(n):
    """ Example taken from
        T. Sakurai, H. Tadano, Y. Inadomi and U. Nagashima
        A moment-based method for large-scale generalized eigenvalue problems
        Appl. Num. Anal. Comp. Math. Vol. 1 No. 2 (2004) """

    A = sparse.eye(n, n)
    d0 = array(r_[5,6*ones(n-2),5])
    d1 = -4*ones(n)
    d2 = ones(n)
    B = sparse.spdiags([d2,d1,d0,d1,d2],[-2,-1,0,1,2],n,n)

    k = arange(1,n+1)
    w_ex = sort(1./(16.*pow(cos(0.5*k*pi/(n+1)),4)))  # exact eigenvalues

    return A, B, w_ex
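For orientation, the spdiags call above produces a pentadiagonal matrix. A quick illustrative check of the same construction, outside the function, for n = 5:

import numpy as np
from scipy import sparse

n = 5
d0 = np.r_[5, 6 * np.ones(n - 2), 5]
d1 = -4 * np.ones(n)
d2 = np.ones(n)
B = sparse.spdiags([d2, d1, d0, d1, d2], [-2, -1, 0, 1, 2], n, n)
print(B.toarray())
# [[ 5. -4.  1.  0.  0.]
#  [-4.  6. -4.  1.  0.]
#  [ 1. -4.  6. -4.  1.]
#  [ 0.  1. -4.  6. -4.]
#  [ 0.  0.  1. -4.  5.]]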
Example #4
Source File: test_linsolve.py    From Computable with MIT License    6 votes
def test_twodiags(self):
        A = spdiags([[1, 2, 3, 4, 5], [6, 5, 8, 9, 10]], [0, 1], 5, 5)
        b = array([1, 2, 3, 4, 5])

        # condition number of A
        cond_A = norm(A.todense(),2) * norm(inv(A.todense()),2)

        for t in ['f','d','F','D']:
            eps = finfo(t).eps  # floating point epsilon
            b = b.astype(t)

            for format in ['csc','csr']:
                Asp = A.astype(t).asformat(format)

                x = spsolve(Asp,b)

                assert_(norm(b - Asp*x) < 10 * cond_A * eps) 
Example #5
Source File: test_umfpack.py    From Computable with MIT License    6 votes
def setUp(self):
        random.seed(0)  # make tests repeatable
        self.real_matrices = []
        self.real_matrices.append(spdiags([[1, 2, 3, 4, 5], [6, 5, 8, 9, 10]],
                                          [0, 1], 5, 5))
        self.real_matrices.append(spdiags([[1, 2, 3, 4, 5], [6, 5, 8, 9, 10]],
                                          [0, 1], 4, 5))
        self.real_matrices.append(spdiags([[1, 2, 3, 4, 5], [6, 5, 8, 9, 10]],
                                          [0, 2], 5, 5))
        self.real_matrices.append(rand(3,3))
        self.real_matrices.append(rand(5,4))
        self.real_matrices.append(rand(4,5))

        self.real_matrices = [csc_matrix(x).astype('d') for x
                in self.real_matrices]
        self.complex_matrices = [x.astype(np.complex128)
                                 for x in self.real_matrices]

        _DeprecationAccept.setUp(self)

# Skip methods if umfpack not present 
Example #6
Source File: STFIWF.py    From 2016CCF-sougou with Apache License 2.0    6 votes
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r', encoding='utf-8') as f:
            for line in f:
                strs = line.strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print(key)

        topic = preprocessing.MinMaxScaler().fit_transform(topic)
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic 
Example #7
Source File: support.py    From ektelo with Apache License 2.0    6 votes
def expansion_matrix(mapping, canonical_order=False):
    """ Returns an n x m matrix E where n is the dimension of 
        the original data and m is the dimension of the reduced data.

        Expands data vector x with E x'
        Reduces workload matrix W with W E
    """
    assert mapping.ndim == 1, "Can only handle 1-dimensional mappings for now, domain should be flattened"

    unique, indices, inverse, counts = mapping_statistics(mapping)

    if canonical_order:
        mapping = canonical_ordering(mapping)

    n = mapping.size
    m = unique.size
    data = np.ones(n)
    cols = np.arange(n)
    rows = inverse

    R = sparse.csr_matrix((data, (rows, cols)), shape=(m, n), dtype=int)
    scale = sparse.spdiags(1.0 / counts, 0, m, m)

    return EkteloMatrix(R.T * scale) 
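To make the docstring concrete, here is a small self-contained sketch of the same construction using plain scipy (without ektelo's EkteloMatrix wrapper or the mapping_statistics helper, both assumed from the surrounding project):

import numpy as np
from scipy import sparse

mapping = np.array([0, 0, 1, 1, 1])   # 5 original cells reduced to 2 groups
unique, inverse, counts = np.unique(mapping, return_inverse=True, return_counts=True)
n, m = mapping.size, unique.size

R = sparse.csr_matrix((np.ones(n), (inverse, np.arange(n))), shape=(m, n), dtype=int)
scale = sparse.spdiags(1.0 / counts, 0, m, m)
E = R.T * scale                       # n x m expansion matrix

x_reduced = np.array([10.0, 30.0])    # reduced data: one total per group
print(E @ x_reduced)                  # [ 5.  5. 10. 10. 10.] -- totals spread evenly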
Example #8
Source File: spectral_graph_partition.py    From graph-partition-neural-network-samples with MIT License    6 votes
def compute_laplacian_sparse(W):

  num_node = W.shape[0]
  D = np.sum(W, axis=0).A1

  # unnormalized graph laplacian
  # L = np.diag(D) - W

  # normalized graph laplacian
  idx = D != 0
  diag_vec = np.zeros_like(D)
  diag_vec[idx] = 1.0

  row = np.nonzero(D)[0]
  col = row
  diag_val = csr_matrix(
      (diag_vec, (row, col)), shape=[num_node, num_node], dtype=np.float32)

  D_inv = np.zeros_like(D)
  D_inv[idx] = 1.0 / D[idx]

  L = diag_val - spdiags(D_inv, 0, num_node, num_node) * W

  return L 
Example #9
Source File: DataDrivenRelation.py    From data_pipeline with Apache License 2.0    6 votes
def fit(self, X, y=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            n_samples=float(n_samples)
            df = _document_frequency(X)

            # note: unlike sklearn's TfidfTransformer, this variant uses the
            # raw document-frequency ratio rather than a log-based idf:
            # idf = np.log(df / n_samples)
            idf = old_div(df, n_samples)
            self._idf_diag = sp.spdiags(idf,
                diags=0, m=n_features, n=n_features)
        return self 
Example #10
Source File: test_umfpack.py    From scikit-umfpack with BSD 3-Clause "New" or "Revised" License    6 votes
def setUp(self):
        random.seed(0)  # make tests repeatable
        real_matrices = []
        real_matrices.append(spdiags([[1, 2, 3, 4, 5], [6, 5, 8, 9, 10]],
                                     [0, 1], 5, 5))
        real_matrices.append(spdiags([[1, 2, 3, 4, 5], [6, 5, 8, 9, 10]],
                                     [0, 1], 4, 5))
        real_matrices.append(spdiags([[1, 2, 3, 4, 5], [6, 5, 8, 9, 10]],
                                     [0, 2], 5, 5))
        real_matrices.append(rand(3,3))
        real_matrices.append(rand(5,4))
        real_matrices.append(rand(4,5))

        self.real_matrices = [csc_matrix(x).astype('d')
                              for x in real_matrices]
        self.complex_matrices = [x.astype(np.complex128)
                                 for x in self.real_matrices]

        self.real_int64_matrices = [_to_int64(x)
                                   for x in self.real_matrices]
        self.complex_int64_matrices = [_to_int64(x)
                                      for x in self.complex_matrices]

        _DeprecationAccept.setUp(self) 
Example #11
Source File: vectorizers.py    From textvec with MIT License    6 votes
def transform(self, X, min_freq=1):
        if self.sublinear_tf:
            X = ensure_sparse_format(X)
            np.log(X.data, X.data)
            X.data += 1
        tp = self._tp
        fn = self._fn

        f = self._n_features

        k = np.log(
            2 + tp / fn ** (2 / (2 - ((tp >= min_freq) | (fn >= min_freq)).astype(int))))

        X = X * sp.spdiags(k, 0, f, f)
        if self.norm:
            X = normalize(X, self.norm, copy=False)

        return X


# TODO: add frequency source 
Example #12
Source File: vectorizers.py    From textvec with MIT License    6 votes
def transform(self, X, min_freq=1):
        if self.sublinear_tf:
            X = ensure_sparse_format(X)
            np.log(X.data, X.data)
            X.data += 1
        tp = self._tp
        fn = self._fn

        f = self._n_features

        k = np.log2(
            1 + (2 / (2 - ((tp >= min_freq) | (fn >= min_freq)).astype(int))))

        X = X * sp.spdiags(k, 0, f, f)
        if self.norm:
            X = normalize(X, self.norm, copy=False)

        return X 
Example #13
Source File: vectorizers.py    From textvec with MIT License    6 votes
def transform(self, X):
        if self.sublinear_tf:
            X = ensure_sparse_format(X)
            np.log(X.data, X.data)
            X.data += 1

        tp = self._tp
        fn = self._fn

        f = self._n_features

        k = np.log2(2 + tp / fn)

        X = X * sp.spdiags(k, 0, f, f)
        if self.norm:
            X = normalize(X, self.norm, copy=False)

        return X 
Example #14
Source File: vectorizers.py    From textvec with MIT License    6 votes
def transform(self, X):
        if self.sublinear_tf:
            X = ensure_sparse_format(X)
            np.log(X.data, X.data)
            X.data += 1

        tp = self._tp
        fp = self._fp
        fn = self._fn
        tn = self._tn

        n = self._n_samples

        f = self._n_features

        # chi-squared statistic: n*(tp*tn - fp*fn)^2 divided by the product
        # of the marginals (the original `k *= v` inverts this formula)
        k = n * (tp * tn - fp * fn)**2
        v = (tp + fp) * (fn + tn) * (tp + fn) * (fp + tn)
        k = k / v

        X = X * sp.spdiags(k, 0, f, f)
        if self.norm:
            X = normalize(X, self.norm, copy=False)

        return X 
Example #15
Source File: vectorizers.py    From textvec with MIT License    6 votes
def transform(self, X):
        tp = self._tp
        fp = self._fp
        fn = self._fn
        tn = self._tn

        f = self._n_features
        tpr = tp / self._p
        fpr = fp / self._n
        min_bound, max_bound = 0.0005, 1 - 0.0005
        tpr[tpr < min_bound] = min_bound
        tpr[tpr > max_bound] = max_bound
        fpr[fpr < min_bound] = min_bound
        fpr[fpr > max_bound] = max_bound
        k = np.abs(norm.ppf(tpr) - norm.ppf(fpr))
        X = X * sp.spdiags(k, 0, f, f)
        if self.norm:
            X = normalize(X, self.norm, copy=False)
        return X 
Example #16
Source File: text.py    From Splunking-Crime with GNU Affero General Public License v3.0    6 votes
def fit(self, X, y=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(float(n_samples) / df) + 1.0
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                        n=n_features, format='csr')

        return self 
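The stored _idf_diag is applied later, in the transformer's transform() step, as a right multiplication, which scales each column (feature) by its idf. A minimal sketch of that end-to-end flow (variable names are illustrative):

import numpy as np
import scipy.sparse as sp

counts = sp.csr_matrix(np.array([[3, 0, 1],
                                 [2, 0, 0],
                                 [0, 1, 1]], dtype=float))
n_samples, n_features = counts.shape
df = np.bincount(counts.indices, minlength=n_features)   # document frequency
idf = np.log(float(n_samples + 1) / (df + 1)) + 1.0      # smoothed, as in fit() above
idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features, format='csr')
tfidf = counts * idf_diag                                # column-wise idf scaling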
Example #17
Source File: cmm.py    From cmm with GNU General Public License v2.0    6 votes
def manifold_harmonics(verts, tris, K, scaled=True, return_D=False, return_eigenvalues=False):
    Q, vertex_area = compute_mesh_laplacian(
        verts, tris, 'cotangent', 
        return_vertex_area=True, area_type='lumped_mass'
    )
    if scaled:
        D = sparse.spdiags(vertex_area, 0, len(verts), len(verts))
    else:
        D = sparse.spdiags(np.ones_like(vertex_area), 0, len(verts), len(verts))

    try:
        lambda_dense, Phi_dense = eigsh(-Q, M=D, k=K, sigma=0)
    except RuntimeError as e:
        if str(e) == 'Factor is exactly singular':
            logging.warning("factor is singular, trying some regularization and cholmod")
            chol_solve = factorized(-Q + sparse.eye(Q.shape[0]) * 1.e-9)
            OPinv = sparse.linalg.LinearOperator(Q.shape, matvec=chol_solve)
            lambda_dense, Phi_dense = eigsh(-Q, M=D, k=K, sigma=0, OPinv=OPinv)
        else:
            raise e 
Example #18
Source File: geodesic.py    From cmm with GNU General Public License v2.0    6 votes
def __init__(self, verts, tris, m=1.0):
        self._verts = verts
        self._tris = tris
        # precompute some stuff needed later on
        e01 = verts[tris[:,1]] - verts[tris[:,0]]
        e12 = verts[tris[:,2]] - verts[tris[:,1]]
        e20 = verts[tris[:,0]] - verts[tris[:,2]]
        self._triangle_area = .5 * veclen(np.cross(e01, e12))
        unit_normal = normalized(np.cross(normalized(e01), normalized(e12)))
        self._un = unit_normal
        self._unit_normal_cross_e01 = np.cross(unit_normal, -e01)
        self._unit_normal_cross_e12 = np.cross(unit_normal, -e12)
        self._unit_normal_cross_e20 = np.cross(unit_normal, -e20)
        # parameters for heat method
        h = np.mean([veclen(e) for e in (e01, e12, e20)])
        t = m * h ** 2
        # pre-factorize poisson systems
        Lc, vertex_area = compute_mesh_laplacian(verts, tris, area_type='lumped_mass')
        A = sparse.spdiags(vertex_area, 0, len(verts), len(verts))
        #self._factored_AtLc = splu((A - t * Lc).tocsc()).solve
        self._factored_AtLc = factorized((A - t * Lc).tocsc())
        #self._factored_L = splu(Lc.tocsc()).solve
        self._factored_L = factorized(Lc.tocsc()) 
Example #19
Source File: calibration.py    From qkit with GNU General Public License v2.0    6 votes
def _baseline_als(self,y, lam, p, niter=10):
        '''
        see http://zanran_storage.s3.amazonaws.com/www.science.uva.nl/ContentPages/443199618.pdf
        "Asymmetric Least Squares Smoothing" by P. Eilers and H. Boelens in 2005.
        http://stackoverflow.com/questions/29156532/python-baseline-correction-library
        "There are two parameters: p for asymmetry and lambda for smoothness. Both have to be
        tuned to the data at hand. We found that generally 0.001<=p<=0.1 is a good choice
        (for a signal with positive peaks) and 10e2<=lambda<=10e9, but exceptions may occur."
        '''
        L = len(y)
        D = sparse.csc_matrix(np.diff(np.eye(L), 2))
        w = np.ones(L)
        for i in range(niter):
            W = sparse.spdiags(w, 0, L, L)
            Z = W + lam * D.dot(D.transpose())
            z = sparse.linalg.spsolve(Z, w*y)
            w = p * (y > z) + (1-p) * (y < z)
        return z 
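For quick experimentation outside the class, the same procedure can be written as a standalone function; this is a sketch of the method above, applied to a made-up drifting signal (lam and p chosen from the docstring's suggested ranges):

import numpy as np
from scipy import sparse
from scipy.sparse.linalg import spsolve

def baseline_als(y, lam=1e5, p=0.01, niter=10):
    L = len(y)
    D = sparse.csc_matrix(np.diff(np.eye(L), 2))   # second-difference operator
    w = np.ones(L)
    for _ in range(niter):
        W = sparse.spdiags(w, 0, L, L)
        Z = W + lam * D.dot(D.transpose())
        z = spsolve(Z, w * y)
        w = p * (y > z) + (1 - p) * (y < z)
    return z

t = np.linspace(0, 10, 200)
y = np.sin(t) ** 2 + 0.3 * t          # peaks on a slanted baseline
corrected = y - baseline_als(y, lam=1e5, p=0.01)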
Example #20
Source File: STFIWF.py    From 2016_CCFsougou2 with MIT License    6 votes
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r', encoding='utf-8') as f:
            for line in f:
                strs = line.strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print(key)

        topic = preprocessing.MinMaxScaler().fit_transform(topic)
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic 
Example #21
Source File: vectorizers.py    From textvec with MIT License    5 votes
def transform(self, X):
        if self.sublinear_tf:
            X = ensure_sparse_format(X)
            np.log(X.data, X.data)
            X.data += 1

        tp = self._tp
        fp = self._fp
        fn = self._fn
        tn = self._tn

        n = self._n_samples

        f = self._n_features

        k = -((tp + fp) / n) * np.log((tp + fp) / n)
        k -= ((fn + tn) / n) * np.log((fn + tn) / n)
        k += (tp / n) * np.log(tp / (tp + fn))
        k += (fn / n) * np.log(fn / (tp + fn))
        k += (fp / n) * np.log(fp / (fp + tn))
        k += (tn / n) * np.log(tn / (fp + tn))

        d = -((tp + fp) / n) * np.log((tp + fp) / n)
        d -= ((fn + tn) / n) * np.log((fn + tn) / n)

        k *= d

        X = X * sp.spdiags(k, 0, f, f)
        if self.norm:
            X = normalize(X, self.norm, copy=False)

        return X 
Example #22
Source File: algebraicconnectivity.py    From Carnets with BSD 3-Clause "New" or "Revised" License    5 votes
def _get_fiedler_func(method):
    """Returns a function that solves the Fiedler eigenvalue problem.
    """
    if method == "tracemin":  # old style keyword <v2.1
        method = "tracemin_pcg"
    if method in ("tracemin_pcg", "tracemin_chol", "tracemin_lu"):
        def find_fiedler(L, x, normalized, tol, seed):
            q = 1 if method == 'tracemin_pcg' else min(4, L.shape[0] - 1)
            X = asmatrix(seed.normal(size=(q, L.shape[0]))).T
            sigma, X = _tracemin_fiedler(L, X, normalized, tol, method)
            return sigma[0], X[:, 0]
    elif method == 'lanczos' or method == 'lobpcg':
        def find_fiedler(L, x, normalized, tol, seed):
            L = csc_matrix(L, dtype=float)
            n = L.shape[0]
            if normalized:
                D = spdiags(1. / sqrt(L.diagonal()), [0], n, n, format='csc')
                L = D * L * D
            if method == 'lanczos' or n < 10:
                # Avoid LOBPCG when n < 10 due to
                # https://github.com/scipy/scipy/issues/3592
                # https://github.com/scipy/scipy/pull/3594
                sigma, X = eigsh(L, 2, which='SM', tol=tol,
                                 return_eigenvectors=True)
                return sigma[1], X[:, 1]
            else:
                X = asarray(asmatrix(x).T)
                M = spdiags(1. / L.diagonal(), [0], n, n)
                Y = ones(n)
                if normalized:
                    Y /= D.diagonal()
                sigma, X = lobpcg(L, X, M=M, Y=asmatrix(Y).T, tol=tol,
                                  maxiter=n, largest=False)
                return sigma[0], X[:, 0]
    else:
        raise nx.NetworkXError("unknown method '%s'." % method)

    return find_fiedler 
Example #23
Source File: layout.py    From aws-kube-codesuite with Apache License 2.0    5 votes
def _sparse_spectral(A, dim=2):
    # Input adjacency matrix A
    # Uses sparse eigenvalue solver from scipy
    # Could use multilevel methods here, see Koren "On spectral graph drawing"
    try:
        import numpy as np
        from scipy.sparse import spdiags
        from scipy.sparse.linalg.eigen import eigsh
    except ImportError:
        msg = "_sparse_spectral() requires scipy & numpy: http://scipy.org/ "
        raise ImportError(msg)
    try:
        nnodes, _ = A.shape
    except AttributeError:
        msg = "sparse_spectral() takes an adjacency matrix as input"
        raise nx.NetworkXError(msg)

    # form Laplacian matrix
    data = np.asarray(A.sum(axis=1).T)
    D = spdiags(data, 0, nnodes, nnodes)
    L = D - A

    k = dim + 1
    # number of Lanczos vectors for ARPACK solver. What is the right scaling?
    ncv = max(2 * k + 1, int(np.sqrt(nnodes)))
    # return smallest k eigenvalues and eigenvectors
    eigenvalues, eigenvectors = eigsh(L, k, which='SM', ncv=ncv)
    index = np.argsort(eigenvalues)[1:k]  # 0 index is zero eigenvalue
    return np.real(eigenvectors[:, index]) 
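The D - A Laplacian construction in the middle of this function is a common spdiags pattern. As a standalone sketch on a toy 3-node path graph:

import numpy as np
from scipy.sparse import csr_matrix, spdiags

A = csr_matrix(np.array([[0, 1, 0],
                         [1, 0, 1],
                         [0, 1, 0]], dtype=float))
data = np.asarray(A.sum(axis=1).T)     # node degrees, as a 1 x n array
D = spdiags(data, 0, 3, 3)
L = D - A
print(L.toarray())
# [[ 1. -1.  0.]
#  [-1.  2. -1.]
#  [ 0. -1.  1.]]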
Example #24
Source File: utils.py    From PyPardisoProject with BSD 3-Clause "New" or "Revised" License    5 votes
def create_test_A_b_small(matrix=False, sort_indices=True):
    """
    --- A ---
    scipy.sparse.csr.csr_matrix, float64
    matrix([[5, 1, 0, 0, 0],
            [0, 6, 2, 0, 0],
            [0, 0, 7, 3, 0],
            [0, 0, 0, 8, 4],
            [0, 0, 0, 0, 9]])
    --- b ---
    np.ndarray, float64
    array([[ 1],
           [ 4],
           [ 7],
           [10],
           [13]])
    or       
    array([[ 1,  2,  3],
           [ 4,  5,  6],
           [ 7,  8,  9],
           [10, 11, 12],
           [13, 14, 15]])
    
    """
    A = sp.spdiags(np.arange(10, dtype=np.float64).reshape(2,5), [1, 0], 5, 5, format='csr')
    if sort_indices:
        A.sort_indices()
    b = np.arange(1,16, dtype=np.float64).reshape(5,3)
    if matrix:
        return A, b
    else:
        return A, b[:,[0]] 
Example #25
Source File: vectorizers.py    From textvec with MIT License    5 votes
def transform(self, X, min_freq=1):
        if self.sublinear_tf:
            X = ensure_sparse_format(X)
            np.log(X.data, X.data)
            X.data += 1
        f = self._n_features
        X = X * sp.spdiags(self.k, 0, f, f)
        if self.norm:
            X = normalize(X, self.norm, copy=False)
        return X 
Example #26
Source File: vectorizers.py    From textvec with MIT License    5 votes
def fit(self, X, y):

        n_samples, n_features = X.shape

        pos_samples = sp.spdiags(y, 0, n_samples, n_samples)
        neg_samples = sp.spdiags(1 - y, 0, n_samples, n_samples)

        X_pos = pos_samples * X
        X_neg = neg_samples * X

        tp = np.bincount(X_pos.indices, minlength=n_features)
        fp = np.sum(y) - tp
        tn = np.bincount(X_neg.indices, minlength=n_features)
        fn = np.sum(1 - y) - tn

        self._n_samples = n_samples
        self._n_features = n_features

        self._tp = tp
        self._fp = fp
        self._fn = fn
        self._tn = tn
        self._p = np.sum(y)
        self._n = np.sum(1 - y)

        if self.smooth_df:
            self._n_samples += int(self.smooth_df)
            self._tp += int(self.smooth_df)
            self._fp += int(self.smooth_df)
            self._fn += int(self.smooth_df)
            self._tn += int(self.smooth_df)
        return self 
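The two spdiags calls here act as row selectors: multiplying X on the left by diag(y) zeroes out the rows whose label is 0, and scipy's dia-to-csr conversion drops the zeroed diagonal entries, so the bincount over .indices only counts positive-class rows. A tiny illustration with made-up data:

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[1, 0],
                            [1, 1],
                            [0, 1]]))
y = np.array([1, 0, 1])
X_pos = sp.spdiags(y, 0, 3, 3) * X     # keeps the rows where y == 1
print(X_pos.toarray())
# [[1 0]
#  [0 0]
#  [0 1]]
tp = np.bincount(X_pos.indices, minlength=2)
print(tp)                              # [1 1]: positive docs containing each feature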
Example #27
Source File: hp.py    From MTSAnomalyDetection with Apache License 2.0    5 votes
def hp_filter(x, lamb=5000):
    w = len(x)
    b = [[1] * w, [-2] * w, [1] * w]
    D = sparse.spdiags(b, [0, 1, 2], w - 2, w)
    I = sparse.eye(w)
    B = (I + lamb * (D.transpose() * D))
    return sparse.linalg.spsolve(B, x)
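A quick, illustrative sanity check of the second-difference matrix D built above: it has shape (w-2, w), and applying it to a quadratic sequence yields a constant.

import numpy as np
from scipy import sparse

w = 6
b = [[1] * w, [-2] * w, [1] * w]
D = sparse.spdiags(b, [0, 1, 2], w - 2, w)
print(D.toarray())
# [[ 1 -2  1  0  0  0]
#  [ 0  1 -2  1  0  0]
#  [ 0  0  1 -2  1  0]
#  [ 0  0  0  1 -2  1]]
x = np.arange(w) ** 2      # 0, 1, 4, 9, 16, 25
print(D @ x)               # [2 2 2 2] -- constant second difference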
Example #28
Source File: vectorizers.py    From textvec with MIT License    5 votes
def fit(self, X, y):
        n_samples, n_features = X.shape
        samples = []
        for i, val in enumerate(y.unique()):
            class_y = y == val
            class_p = np.sum(class_y)
            class_n = np.sum(1 - class_y)
            pos_samples = sp.spdiags(class_y, 0, n_samples, n_samples)
            neg_samples = sp.spdiags(1 - class_y, 0, n_samples, n_samples)

            class_X_pos = pos_samples * X
            class_X_neg = neg_samples * X
            tp = np.bincount(class_X_pos.indices, minlength=n_features)
            fp = class_p - tp
            tn = np.bincount(class_X_neg.indices, minlength=n_features)
            fn = class_n - tn
            tpr = tp / class_p
            fpr = fp / class_n
            min_bound, max_bound = 0.0005, 1 - 0.0005
            tpr[tpr < min_bound] = min_bound
            tpr[tpr > max_bound] = max_bound
            fpr[fpr < min_bound] = min_bound
            fpr[fpr > max_bound] = max_bound
            samples.append(norm.ppf(tpr) - norm.ppf(fpr))
        samples = np.array(samples)
        self.k = np.max(samples, axis=0)
        self._n_features = n_features
        return self 
Example #29
Source File: vectorizers.py    From textvec with MIT License    5 votes
def transform(self, X, confidence=False):
        """
        Parameters
        ----------
        confidence : boolean, default=False
            If True, also return a boolean vector marking the features whose
            95% confidence interval for the odds ratio excludes 1.0.
        """
        if self.sublinear_tf:
            X = ensure_sparse_format(X)
            np.log(X.data, X.data)
            X.data += 1

        tp = self._tp
        fp = self._fp
        fn = self._fn
        tn = self._tn

        f = self._n_features

        k = np.log((tp * tn) / (fp * fn))
        X = X * sp.spdiags(k, 0, f, f)

        if self.norm:
            X = normalize(X, self.norm, copy=False)

        if confidence:
            up = np.exp(k + 1.96 * np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn))
            low = np.exp(k - 1.96 * np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn))
            return X, (up < 1.0) | (low > 1.0)
        return X 
Example #30
Source File: vectorizers.py    From textvec with MIT License    5 votes
def fit(self, X, y):
        n_samples, n_features = X.shape
        samples = []
        self.number_of_classes = len(np.unique(y))
        for val in range(self.number_of_classes):
            class_mask = sp.spdiags(y == val, 0, n_samples, n_samples)
            samples.append(np.bincount(
                (class_mask * X).indices, minlength=n_features))
        samples = np.array(samples)
        self.corpus_occurence = np.sum(samples != 0, axis=0)
        self.k = np.log2(1 + (self.number_of_classes / self.corpus_occurence))
        self._n_features = n_features
        return self