Python sklearn.preprocessing.scale() Examples

The following are 30 code examples showing how to use sklearn.preprocessing.scale(). These examples are extracted from open source projects. You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage in the sidebar.

You may also want to check out all available functions/classes of the module sklearn.preprocessing, or try the search function.
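Before the examples, here is a minimal, self-contained sketch of the basic call: by default preprocessing.scale() standardizes each column (feature) of a 2-D array to zero mean and unit variance. The array values below are made up purely for illustration.

import numpy as np
from sklearn import preprocessing

# two features on very different ranges
X = np.array([[1.0, 100.0],
              [2.0, 200.0],
              [3.0, 300.0]])

# column-wise standardization: each feature ends up with mean 0 and standard deviation 1
X_scaled = preprocessing.scale(X)
print(X_scaled.mean(axis=0))  # approximately [0. 0.]
print(X_scaled.std(axis=0))   # approximately [1. 1.]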

Example 1
Project: geosketch   Author: brianhie   File: umbilical.py    License: MIT License 6 votes
def violin_jitter(X, genes, gene, labels, focus, background=None,
                  xlabels=None):
    gidx = list(genes).index(gene)

    focus_idx = focus == labels
    if background is None:
        background_idx = focus != labels
    else:
        background_idx = background == labels

    if xlabels is None:
        xlabels = [ 'Background', 'Focus' ]

    x_gene = X[:, gidx].toarray().flatten()
    x_focus = x_gene[focus_idx]
    x_background = x_gene[background_idx]
    
    plt.figure()
    sns.violinplot(data=[ x_focus, x_background ], scale='width', cut=0)
    sns.stripplot(data=[ x_focus, x_background ], jitter=True, color='black', size=1)
    plt.xticks([0, 1], xlabels)
    plt.savefig('{}_violin_{}.png'.format(NAMESPACE, gene)) 
Example 2
Project: Awesome-RecSystem-Models   Author: JianzhouZhan   File: FFM_Multi_PyTorch.py    License: MIT License 6 votes
def train_FFM_model_demo():

    # Step 1: load the data
    x_train, y_train, x_test, y_test, feature2field = load_dataset()
    x_train = preprocessing.scale(x_train, with_mean=True, with_std=True)
    x_test = preprocessing.scale(x_test, with_mean=True, with_std=True)
    class_num = len(set([y for y in y_train] + [y for y in y_test]))

    # FFM model
    ffm = FFM_layer(field_map_dict=feature2field, fea_num=x_train.shape[1], reg_l1=0.01, reg_l2=0.01,
                    class_num=class_num, latent_factor_dim=10).to(DEVICE)

    # define the loss function and optimizer
    optm = torch.optim.Adam(ffm.parameters())

    train_loader = get_batch_loader(x_train, y_train, BATCH_SIZE, shuffle=True)
    test_loader = get_batch_loader(x_test, y_test, BATCH_SIZE, shuffle=False)

    for epoch in range(1, EPOCHS + 1):
        train(ffm, DEVICE, train_loader, optm, epoch)
        test(ffm, DEVICE, test_loader) 
Example 3
Project: Awesome-RecSystem-Models   Author: JianzhouZhan   File: FM_Multi_PyTorch.py    License: MIT License 6 votes
def train_FM_model_demo():

    # Step 1: load the data
    x_train, y_train, x_test, y_test = load_dataset()
    x_train = preprocessing.scale(x_train, with_mean=True, with_std=True)
    x_test = preprocessing.scale(x_test, with_mean=True, with_std=True)
    class_num = len(set([y for y in y_train] + [y for y in y_test]))

    # FM model
    fm = FM_layer(class_num=class_num, feature_num=x_train.shape[1], latent_factor_dim=40).to(DEVICE)

    # define the loss function and optimizer
    optm = torch.optim.Adam(fm.parameters())

    train_loader = get_batch_loader(x_train, y_train, BATCH_SIZE, shuffle=True)
    test_loader = get_batch_loader(x_test, y_test, BATCH_SIZE, shuffle=False)

    for epoch in range(1, EPOCHS + 1):
        train(fm, DEVICE, train_loader, optm, epoch)
        test(fm, DEVICE, test_loader) 
Example 4
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_logistic.py    License: MIT License 6 votes
def test_elastic_net_versus_sgd(C, l1_ratio):
    # Compare elasticnet penalty in LogisticRegression() and SGD(loss='log')
    n_samples = 500
    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, n_repeated=0,
                               random_state=1)
    X = scale(X)

    sgd = SGDClassifier(
        penalty='elasticnet', random_state=1, fit_intercept=False, tol=-np.inf,
        max_iter=2000, l1_ratio=l1_ratio, alpha=1. / C / n_samples, loss='log')
    log = LogisticRegression(
        penalty='elasticnet', random_state=1, fit_intercept=False, tol=1e-5,
        max_iter=1000, l1_ratio=l1_ratio, C=C, solver='saga')

    sgd.fit(X, y)
    log.fit(X, y)
    assert_array_almost_equal(sgd.coef_, log.coef_, decimal=1) 
Example 5
Project: ssbio   Author: SBRG   File: atlas3.py    License: MIT License 6 votes
def run_pca(self, whiten=True):
        # Normalize
        for_pca_df = self.features_df.T
        for_pca_df_scaled = pd.DataFrame(preprocessing.scale(for_pca_df), columns=for_pca_df.columns)

        # Run PCA
        self.num_components = min(len(for_pca_df.T.columns), len(for_pca_df.T.index))
        pca = PCA(n_components=self.num_components, whiten=whiten)
        pca_fit = pca.fit_transform(for_pca_df_scaled)
        self.pc_names_list = ['PC{} ({:.0%})'.format(x + 1, pca.explained_variance_ratio_[x]) for x in
                                  range(self.num_components)]
        self.pc_names_dict = {k.split(' ')[0]: k for k in self.pc_names_list}
        principal_df = pd.DataFrame(data=pca_fit, columns=self.pc_names_list, index=for_pca_df.index)
        principal_df.index.name = 'strain'

        self.principal_df = principal_df
        self.pca = pca
        # self.principal_observations_df = self.principal_df.join(self.observations_df, how='inner')
        #
        # # Make iterable list of markers
        # mks = itertools.cycle(["<", "+", "o", 'D', 'x', '^', '*', '8', 's', 'p', 'v', 'X', '_', 'h'])
        # self.markers = [next(mks) for i in range(len(self.principal_observations_df[self.observation_colname].unique()))] 
Example 6
Project: MultipleFactorRiskModel   Author: icezerowjj   File: Get_flow_ev.py    License: MIT License 6 votes
def get_ind_return(data):
    '''
    Reorganize the column-wise concatenated data read from the xlsx files and
    compute the monthly return of each industry.
    :param [DataFrame] data: monthly trading data read from the xlsx files
    :return: [DataFrame] ind_ret: month * industry, the monthly return of each industry
    '''
    # Read stk_ind_pair.xlsx as the lookup table between stocks and their industries
    stk_ind = pd.read_excel('E:\\QuantProject2\\temp_data\\stk_ind_pair.xlsx')
    # Strip the letters that follow the numeric part of the stock codes in stk_ind
    stk_ind.Stkcd = stk_ind.Stkcd.apply(lambda x: x[:6])
    # Merge stk_ind into data to attach the industry information
    data = pd.merge(data, stk_ind, on='Stkcd')
    # Group by month and industry
    groups = data.groupby(['Trdmnt', 'ind'])
    # Total market value of each industry in each month
    total_Ms = groups['Msmvttl'].sum()
    # Market-value-weighted return of each industry in each month
    total_Mr = groups['total_Mr'].sum()
    # Divide to get the average return of each industry in each month
    ind_ret = total_Mr / total_Ms
    # Unstack the inner index level of ind_ret into columns
    ind_ret = ind_ret.unstack()
    # Standardize ind_ret
    ind_ret = pd.DataFrame(scale(ind_ret), columns=ind_ret.columns)
    return ind_ret 
Example 7
Project: MultipleFactorRiskModel   Author: icezerowjj   File: industry_return.py    License: MIT License 6 votes
def get_ind_return(data):
    '''
    Reorganize the column-wise concatenated data read from the xlsx files and
    compute the monthly return of each industry.
    :param [DataFrame] data: monthly trading data read from the xlsx files
    :return: [DataFrame] ind_ret: month * industry, the monthly return of each industry
    '''
    # Read stk_ind_pair.xlsx as the lookup table between stocks and their industries
    stk_ind = pd.read_excel('E:\\QuantProject2\\temp_data\\stk_ind_pair.xlsx')
    # Strip the letters that follow the numeric part of the stock codes in stk_ind
    stk_ind.Stkcd = stk_ind.Stkcd.apply(lambda x: x[:6])
    # Merge stk_ind into data to attach the industry information
    data = pd.merge(data, stk_ind, on='Stkcd')
    # Group by month and industry
    groups = data.groupby(['Trdmnt', 'ind'])
    # Total market value of each industry in each month
    total_Ms = groups['Msmvttl'].sum()
    # Market-value-weighted return of each industry in each month
    total_Mr = groups['total_Mr'].sum()
    # Divide to get the average return of each industry in each month
    ind_ret = total_Mr / total_Ms
    # Unstack the inner index level of ind_ret into columns
    ind_ret = ind_ret.unstack()
    # Standardize ind_ret
    ind_ret = pd.DataFrame(scale(ind_ret), columns=ind_ret.columns)
    return ind_ret 
Example 8
Project: smallrnaseq   Author: dmnfarrell   File: analysis.py    License: GNU General Public License v3.0 6 votes
def do_pca(X, c=3):
    """Do PCA"""

    from sklearn import preprocessing
    from sklearn.decomposition import PCA
    #do PCA
    #S = standardize_data(X)
    S = pd.DataFrame(preprocessing.scale(X),columns = X.columns)
    pca = PCA(n_components=c)
    pca.fit(S)
    print (pca.explained_variance_ratio_)
    #print pca.components_
    w = pd.DataFrame(pca.components_,columns=S.columns)#,index=['PC1','PC2'])
    #print w.T.max(1).sort_values()
    pX = pca.fit_transform(S)
    pX = pd.DataFrame(pX,index=X.index)
    return pX 
Example 9
Project: safekit   Author: pnnl   File: pca_autoencoder.py    License: MIT License 6 votes
def train(train_data, outfile):
        """
        :param train_data: A Batcher object that delivers batches of train data.
        :param outfile: file object to which results are written.
        """
        outfile.write('day user red loss\n')
        mat = train_data.next_batch()
        while mat is not None:
            datadict = {'features': mat[:, 3:], 'red': mat[:,2], 'user': mat[:,1], 'day': mat[:,0]}
            batch = scale(datadict['features'])
            pca = PCA(n_components=1)
            pca.fit(batch)
            data_reduced = np.dot(batch, pca.components_.T) # pca transform
            data_original = np.dot(data_reduced, pca.components_) # inverse_transform
            pointloss = np.mean(np.square(batch - data_original), axis=1)
            loss = np.mean(pointloss)
            for d, u, t, l, in zip(datadict['day'].tolist(), datadict['user'].tolist(),
                                   datadict['red'].tolist(), pointloss.flatten().tolist()):
                outfile.write('%s %s %s %s\n' % (d, u, t, l))
            print('loss: %.4f' % loss)
            mat = train_data.next_batch() 
Example 10
Project: mljar-supervised   Author: mljar   File: preprocessing_utils.py    License: MIT License 6 votes
def is_log_scale_needed(x_org):
        x = np.array(x_org[~pd.isnull(x_org)])
        # first scale on raw data
        x = preprocessing.scale(x)
        # second scale on log data
        x_log = preprocessing.scale(np.log(x - np.min(x) + 1))

        # the old approach, let's check how new approach will work
        # original_skew = np.abs(stats.skew(x))
        # log_skew = np.abs(stats.skew(x_log))
        # return log_skew < original_skew
        ########################################################################
        # p is probability of being normal distributions
        k2, p1 = stats.normaltest(x)
        k2, p2 = stats.normaltest(x_log)

        return p2 > p1 
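The check above can be reproduced outside the mljar class to see why it works: data that looks more Gaussian after a log transform yields a higher normaltest p-value. A minimal standalone sketch with made-up log-normal data (not part of the mljar code):

import numpy as np
from scipy import stats
from sklearn import preprocessing

rng = np.random.RandomState(0)
x = np.exp(rng.normal(size=1000))  # log-normal, heavily right-skewed

x_scaled = preprocessing.scale(x)
x_log = preprocessing.scale(np.log(x - np.min(x) + 1))

_, p_raw = stats.normaltest(x_scaled)
_, p_log = stats.normaltest(x_log)
print(p_log > p_raw)  # expected to print True for this skewed sample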
Example 11
Project: mljar-supervised   Author: mljar   File: test_nn.py    License: MIT License 6 votes
def setUpClass(cls):
        cls.X, cls.y = datasets.make_regression(
            n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0
        )

        cls.params = {
            "dense_layers": 2,
            "dense_1_size": 8,
            "dense_2_size": 4,
            "dropout": 0,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "decay": 0.001,
            "ml_task": "regression"
        }

        cls.y = preprocessing.scale(cls.y) 
Example 12
Project: marconibot   Author: s4w3d0ff   File: __init__.py    License: GNU General Public License v3.0 6 votes
def train(self, df, shuffle=True, preprocess=False, *args, **kwargs):
        """
        Takes a dataframe of features + a 'label' column and trains the lobe
        """
        if self._trained:
            logger.warning('Overwriting an already trained brain!')
            self._trained = False

        # shuffle data for good luck
        if shuffle:
            df = shuffleDataFrame(df)
        # scale train data and fit lobe
        x = df.drop('label', axis=1).values
        y = df['label'].values
        del df
        if preprocess:
            x = preprocessing.scale(x)
        logger.info('Training with %d samples', len(x))
        self.lobe.fit(x, y)
        self._trained = True 
Example 13
Project: BERMUDA   Author: txWang   File: pre_processing.py    License: MIT License 6 votes
def pre_processing(dataset_file_list, pre_process_paras):
    """ pre-processing of multiple datasets
    Args:
        dataset_file_list: list of filenames of datasets
        pre_process_paras: dict, parameters for pre-processing
    Returns:
        dataset_list: list of datasets
    """
    # parameters
    take_log = pre_process_paras['take_log']
    standardization = pre_process_paras['standardization']
    scaling = pre_process_paras['scaling']

    dataset_list = []
    for data_file in dataset_file_list:
        dataset = read_csv(data_file, take_log)
        if standardization:
            scale(dataset['gene_exp'], axis=1, with_mean=True, with_std=True, copy=False)
        if scaling:  # scale to [0,1]
            minmax_scale(dataset['gene_exp'], feature_range=(0, 1), axis=1, copy=False)
        dataset_list.append(dataset)
    dataset_list = intersect_dataset(dataset_list)  # retain intersection of gene symbols

    return dataset_list 
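Note that pre_processing never assigns the result of scale() or minmax_scale(); it relies on copy=False modifying the arrays in place, which holds when dataset['gene_exp'] is already a float64 ndarray. A small sketch of the row-wise (axis=1) in-place behaviour with a made-up array:

import numpy as np
from sklearn.preprocessing import scale

gene_exp = np.array([[1.0, 2.0, 3.0],
                     [10.0, 20.0, 30.0]])
# standardize each row in place; the return value is deliberately ignored
scale(gene_exp, axis=1, with_mean=True, with_std=True, copy=False)
print(gene_exp)  # each row now has mean 0 and standard deviation 1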
Example 14
Project: SCALE   Author: jsxlei   File: utils.py    License: MIT License 6 votes
def estimate_k(data):
    """
    Estimate number of groups k:
        based on random matrix theory (RTM), borrowed from SC3
        input data is (p,n) matrix, p is feature, n is sample
    """
    p, n = data.shape
    if type(data) is not np.ndarray:
        data = data.toarray()
    x = scale(data)
    muTW = (np.sqrt(n-1) + np.sqrt(p)) ** 2
    sigmaTW = (np.sqrt(n-1) + np.sqrt(p)) * (1/np.sqrt(n-1) + 1/np.sqrt(p)) ** (1/3)
    sigmaHatNaive = x.T.dot(x)

    bd = np.sqrt(p) * sigmaTW + muTW
    evals = np.linalg.eigvalsh(sigmaHatNaive)

    k = 0
    for i in range(len(evals)):
        if evals[i] > bd:
            k += 1
    return k 
Example 15
Project: cmdbac   Author: cmu-db   File: cluster.py    License: Apache License 2.0 6 votes
def kmeans_elbow(data):
    bin_ = Bin(0, 0)
    # processed_data = scale(data)
    data = np.array(data)
    bin_.fit(data)
    processed_data = bin_.transform(data)
    # processed_data = scale(data)

    inertias = []
    for k in K_RANGE:
        kmeans = KMeans(init='k-means++', n_clusters=k)
        kmeans.fit(processed_data)
        inertias.append(kmeans.inertia_)

    fig = plt.figure()
    plt.scatter(K_RANGE, inertias)
    plt.plot(K_RANGE, inertias)
    fig.savefig('kmeans-elbow.png') 
Example 16
Project: nltools   Author: cosanlab   File: brain_data.py    License: MIT License 6 votes
def scale(self, scale_val=100.):
        """ Scale all values such that they are on the range [0, scale_val],
            via grand-mean scaling. This is NOT global-scaling/intensity
            normalization. This is useful for ensuring that data is on a
            common scale (e.g. good for multiple runs, participants, etc)
            and if the default value of 100 is used, can be interpreted as
            something akin to (but not exactly) "percent signal change."
            This is consistent with default behavior in AFNI and SPM.
            Change this value to 10000 to make consistent with FSL.

        Args:
            scale_val: (int/float) what value to send the grand-mean to;
                        default 100

        """

        out = deepcopy(self)
        out.data = out.data / out.data.mean() * scale_val

        return out 
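Grand-mean scaling as used above is just a division by the overall mean followed by multiplication by the target value, so the mean of the rescaled data equals scale_val. A tiny numpy sketch with made-up numbers (not the nltools Brain_Data object):

import numpy as np

data = np.array([[10.0, 20.0],
                 [30.0, 40.0]])
scale_val = 100.0
scaled = data / data.mean() * scale_val  # grand-mean scaling
print(scaled.mean())  # 100.0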
Example 17
Project: nltools   Author: cosanlab   File: brain_data.py    License: MIT License 6 votes
def standardize(self, axis=0, method='center'):
        ''' Standardize Brain_Data() instance.

        Args:
            axis: 0 for observations 1 for voxels
            method: ['center','zscore']

        Returns:
            Brain_Data Instance

        '''

        if axis == 1 and len(self.shape()) == 1:
            raise IndexError("Brain_Data is only 3d but standardization was requested over observations")
        out = self.copy()
        if method == 'zscore':
            with_std = True
        elif method == 'center':
            with_std = False
        else:
            raise ValueError('method must be ["center","zscore"]')
        out.data = scale(out.data, axis=axis, with_std=with_std)
        return out 
Example 18
Project: neural-fingerprinting   Author: StephanZheng   File: util.py    License: BSD 3-Clause "New" or "Revised" License 5 votes
def normalize(normal, adv, noisy):
    """Z-score normalisation
    TODO
    :param normal:
    :param adv:
    :param noisy:
    :return:
    """
    n_samples = len(normal)
    total = scale(np.concatenate((normal, adv, noisy)))

    return total[:n_samples], total[n_samples:2*n_samples], total[2*n_samples:] 
Example 19
Project: Python-ELM   Author: masaponto   File: ecob_elm.py    License: MIT License 5 votes
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import cross_val_score

    db_name = 'iris'
    hid_num = 1000
    data_set = fetch_mldata(db_name, version=1)
    data_set.data = preprocessing.scale(data_set.data)
    data_set.target = preprocessing.LabelEncoder().fit_transform(data_set.target)

    print(db_name)
    print('ECOBELM', hid_num)
    e = ECOBELM(hid_num, c=2**5)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))

    print('ELM', hid_num)
    e = ELM(hid_num)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave)) 
Example 20
Project: whynot   Author: zykls   File: dataloader.py    License: MIT License 5 votes
def load_data(self):
        """Load, preprocess and class-balance the credit data."""
        rng = np.random.RandomState(self.seed)

        data = pd.read_csv(self.datapath, index_col=0)
        data.dropna(inplace=True)

        features = data.drop("SeriousDlqin2yrs", axis=1)
        # zero mean, unit variance
        features = preprocessing.scale(features)

        # add bias term
        features = np.append(features, np.ones((features.shape[0], 1)), axis=1)
        outcomes = np.array(data["SeriousDlqin2yrs"])

        # balance classes
        default_indices = np.where(outcomes == 1)[0]
        other_indices = np.where(outcomes == 0)[0][:10000]
        indices = np.concatenate((default_indices, other_indices))

        features_balanced = features[indices]
        outcomes_balanced = outcomes[indices]

        shape = features_balanced.shape

        # shuffle arrays
        shuffled = rng.permutation(len(indices))
        return features_balanced[shuffled], outcomes_balanced[shuffled] 
Example 21
Project: seizure-prediction   Author: MichaelHills   File: transforms.py    License: MIT License 5 votes
def get_name(self):
        return 'unit-scale' 
Example 22
Project: seizure-prediction   Author: MichaelHills   File: transforms.py    License: MIT License 5 votes
def apply(self, data, meta=None):
        return preprocessing.scale(data, axis=data.ndim-1) 
Example 23
Project: seizure-prediction   Author: MichaelHills   File: transforms.py    License: MIT License 5 votes
def get_name(self):
        return 'unit-scale-feat' 
Example 24
Project: seizure-prediction   Author: MichaelHills   File: transforms.py    License: MIT License 5 votes
def apply(self, data, meta=None):
        return preprocessing.scale(data.astype(np.float64), axis=0) 
Example 25
def extract_features(audio,rate):
    """extract 20 dim mfcc features from an audio, performs CMS and combines 
    delta to make it 40 dim feature vector"""    
    
    mfcc_feature = mfcc.mfcc(audio,rate, 0.025, 0.01,20,nfft = 1200, appendEnergy = True)    
    mfcc_feature = preprocessing.scale(mfcc_feature)
    delta = calculate_delta(mfcc_feature)
    combined = np.hstack((mfcc_feature,delta)) 
    return combined 
Example 26
Project: DCC   Author: shahsohil   File: edgeConstruction.py    License: MIT License 5 votes
def feature_transformation(features, preprocessing='normalization'):
    n_samples, n_features = features.shape
    if preprocessing == 'scale':
        features = skscale(features, copy=False)
    elif preprocessing == 'minmax':
        minmax_scale = MinMaxScaler().fit(features)
        features = minmax_scale.transform(features)
    elif preprocessing == 'normalization':
        features = np.sqrt(n_features) * normalize(features, copy=False)
    else:
        print('No preprocessing is applied')
    return features 
Example 27
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_sgd.py    License: MIT License 5 votes
def test_underflow_or_overlow():
    with np.errstate(all='raise'):
        # Generate some weird data with hugely unscaled features
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 10

        X = rng.normal(size=(n_samples, n_features))
        X[:, :2] *= 1e300
        assert np.isfinite(X).all()

        # Use MinMaxScaler to scale the data without introducing a numerical
        # instability (computing the standard deviation naively is not possible
        # on this data)
        X_scaled = MinMaxScaler().fit_transform(X)
        assert np.isfinite(X_scaled).all()

        # Define a ground truth on the scaled data
        ground_truth = rng.normal(size=n_features)
        y = (np.dot(X_scaled, ground_truth) > 0.).astype(np.int32)
        assert_array_equal(np.unique(y), [0, 1])

        model = SGDClassifier(alpha=0.1, loss='squared_hinge', max_iter=500)

        # smoke test: model is stable on scaled data
        model.fit(X_scaled, y)
        assert np.isfinite(model.coef_).all()

        # model is numerically unstable on unscaled data
        msg_regxp = (r"Floating-point under-/overflow occurred at epoch #.*"
                     " Scaling input data with StandardScaler or MinMaxScaler"
                     " might help.")
        assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y) 
Example 28
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_logistic.py    License: MIT License 5 votes
def test_logreg_l1_sparse_data():
    # Because liblinear penalizes the intercept and saga does not, we do not
    # fit the intercept to make it possible to compare the coefficients of
    # the two models at convergence.
    rng = np.random.RandomState(42)
    n_samples = 50
    X, y = make_classification(n_samples=n_samples, n_features=20,
                               random_state=0)
    X_noise = rng.normal(scale=0.1, size=(n_samples, 3))
    X_constant = np.zeros(shape=(n_samples, 2))
    X = np.concatenate((X, X_noise, X_constant), axis=1)
    X[X < 1] = 0
    X = sparse.csr_matrix(X)

    lr_liblinear = LogisticRegression(penalty="l1", C=1.0, solver='liblinear',
                                      fit_intercept=False, multi_class='ovr',
                                      tol=1e-10)
    lr_liblinear.fit(X, y)

    lr_saga = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                 fit_intercept=False, multi_class='ovr',
                                 max_iter=1000, tol=1e-10)
    lr_saga.fit(X, y)
    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)
    # Noise and constant features should be regularized to zero by the l1
    # penalty
    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5))
    assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5))

    # Check that solving on the sparse and dense data yield the same results
    lr_saga_dense = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                       fit_intercept=False, multi_class='ovr',
                                       max_iter=1000, tol=1e-10)
    lr_saga_dense.fit(X.toarray(), y)
    assert_array_almost_equal(lr_saga.coef_, lr_saga_dense.coef_) 
Example 29
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_logistic.py    License: MIT License 5 votes
def test_LogisticRegression_elastic_net_objective(C, l1_ratio):
    # Check that training with a penalty matching the objective leads
    # to a lower objective.
    # Here we train a logistic regression with l2 (a) and elasticnet (b)
    # penalties, and compute the elasticnet objective. That of a should be
    # greater than that of b (both objectives are convex).
    X, y = make_classification(n_samples=1000, n_classes=2, n_features=20,
                               n_informative=10, n_redundant=0,
                               n_repeated=0, random_state=0)
    X = scale(X)

    lr_enet = LogisticRegression(penalty='elasticnet', solver='saga',
                                 random_state=0, C=C, l1_ratio=l1_ratio,
                                 fit_intercept=False)
    lr_l2 = LogisticRegression(penalty='l2', solver='saga', random_state=0,
                               C=C, fit_intercept=False)
    lr_enet.fit(X, y)
    lr_l2.fit(X, y)

    def enet_objective(lr):
        coef = lr.coef_.ravel()
        obj = C * log_loss(y, lr.predict_proba(X))
        obj += l1_ratio * np.sum(np.abs(coef))
        obj += (1. - l1_ratio) * 0.5 * np.dot(coef, coef)
        return obj

    assert enet_objective(lr_enet) < enet_objective(lr_l2) 
Example 30
Project: MultipleFactorRiskModel   Author: icezerowjj   File: Get_flow_ev.py    License: MIT License 5 votes
def load_industry_data(fname_list):
    '''
    Because of Wind's size limit when writing data to xlsx, the data is spread
    across several files; here the files are concatenated vertically and the
    192 months needed are selected.
    :param fname_list:
    :return:
    '''
    # Read in all the financial data sheets that contain trading data
    data = pd.DataFrame()
    for i in fname_list:
        print('loading', i)
        temp = pd.read_excel(i)
        # Drop the first two rows (Chinese headers)
        temp = temp.iloc[2:, :]
        # Keep months from 2000-01 up to (but not including) 2016-01
        temp = temp[temp.Trdmnt >= '2000-01']
        temp = temp[temp.Trdmnt < '2016-01']
        # Append to data
        data = pd.concat([data, temp], axis=0)
        # Compute market value * return for each stock for later use
        data['total_Mr'] = data.Msmvttl * data.Mretwd
        # Extract each stock's monthly tradable market value, used to replace ev in tech
        stkcd = widgets.get_selected_Stkcd()
        flow_ev = data[['Stkcd', 'Trdmnt', 'Msmvosd']]
        flow_ev = flow_ev.set_index('Stkcd', drop=False)
        flow_ev = flow_ev.ix[stkcd.values]
        flow_ev = flow_ev.set_index(['Trdmnt'], append=True)
        flow_ev = flow_ev.unstack()
        flow_ev = flow_ev['Msmvosd']
        flow_ev = flow_ev.transpose()
        flow_ev.fillna(0, inplace=True)
#        # Standardize ev
#        ev = pd.DataFrame(scale(ev))
    # Reset the index
    data.index = range(data.shape[0])
    # Use stock code and month as a MultiIndex
    # data=data._index(['Trdmnt','Stkcd'])
    return [data,flow_ev]