Python pandas.get_dummies() Examples

The following are code examples showing how to use pandas.get_dummies(), extracted from open source Python projects. The snippets are shown as excerpted from their projects, so most assume the usual imports (import pandas as pd, import numpy as np) plus whatever project-level helpers they call.
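At its simplest, pd.get_dummies turns one categorical column into one indicator column per distinct value. A minimal sketch on toy data:

import pandas as pd

s = pd.Series(['a', 'b', 'a', 'c'])
print(pd.get_dummies(s))
#    a  b  c
# 0  1  0  0
# 1  0  1  0
# 2  1  0  0
# 3  0  0  1
# (pandas 2.x prints True/False here; older versions print 1/0)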

Example 1
Project: ScoreCardModel   Author: data-science-tools   File: discretization.py
def transform(self, x):
        """
        Parameters:

            x (Sequence): - ???????

        Returns:

            np.array: - ????????????numpy??

        """
        s = pd.cut(x, bins=self.bins)
        d = pd.get_dummies(s)
        z = d.T.to_dict()
        re = []
        for i, v in z.items():
            for j, u in v.items():
                if u == 1:
                    re.append(str(j))
        return np.array(re) 
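The dictionary round-trip above recovers, for each element of x, the label of the interval it was binned into. For values that fall inside the bins, a shorter equivalent (a sketch, not the project's code) stringifies the cut result directly:

import numpy as np
import pandas as pd

def transform_labels(x, bins):
    # pd.cut already yields one interval label per element; converting to
    # str gives the same strings the get_dummies round-trip extracts.
    return np.array(pd.cut(x, bins=bins).astype(str))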
Example 2
Project: HousePricePredictionKaggle   Author: Nuwantha   File: gradient_boosting.py
def data_preprocess(train,test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx],inplace=True)
    all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))
    
    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data = all_data.drop(to_delete,axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    #log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train,X_test,y 
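Examples 2-4 concatenate train and test before calling pd.get_dummies and only split the matrix apart afterwards. That ordering matters: encoding the two frames separately produces mismatched columns whenever a category occurs in only one of them. A toy illustration:

import pandas as pd

train = pd.DataFrame({'color': ['red', 'blue']})
test = pd.DataFrame({'color': ['green', 'red']})
list(pd.get_dummies(train).columns)  # ['color_blue', 'color_red']
list(pd.get_dummies(test).columns)   # ['color_green', 'color_red']
# Encoding the concatenation keeps one shared column set, which is then
# split back with all_data[:train.shape[0]] / all_data[train.shape[0]:]:
list(pd.get_dummies(pd.concat([train, test])).columns)
# ['color_blue', 'color_green', 'color_red']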
Example 3
Project: HousePricePredictionKaggle   Author: Nuwantha   File: RandomForest.py
def data_preprocess(train,test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx],inplace=True)
    all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))
    
    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data = all_data.drop(to_delete,axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    #log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train,X_test,y 
Example 4
Project: HousePricePredictionKaggle   Author: Nuwantha   File: ensemble_stacking.py
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477,
                   478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169,
                   1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))

    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(method='ffill')
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train, X_test, y 
Example 5
Project: cloudml-samples   Author: GoogleCloudPlatform   File: model.py
def generator_input(input_file, chunk_size):
  """Generator function to produce features and labels
     needed by keras fit_generator.
  """
  input_reader = pd.read_csv(tf.gfile.Open(input_file[0]),
                             names=CSV_COLUMNS,
                             chunksize=chunk_size,
                             na_values=" ?")

  for input_data in input_reader:
    input_data = input_data.dropna()
    label = pd.get_dummies(input_data.pop(LABEL_COLUMN))

    input_data = to_numeric_features(input_data)
    n_rows = input_data.shape[0]
    return ( (input_data.iloc[[index % n_rows]], label.iloc[[index % n_rows]]) for index in itertools.count() ) 
Example 6
Project: tensorflow   Author: KirovVerst   File: titanic.py
def next_batch(df, i=None):
    """

    :param df: pandas dataframe
    :param i: batch index
    :return: (numpy array x, numpy array y)
    """
    if i is None:
        start = 0
        end = df.shape[0]
    else:
        start = BATCH_SIZE * i
        end = BATCH_SIZE * (i + 1)
    result = df[start:end]
    if "Survived" in result:
        batch_ys = pd.get_dummies(result.pop('Survived').values).as_matrix()
        batch_xs = result.as_matrix()
        return batch_xs, batch_ys
    else:
        return result.as_matrix() 
Example 7
Project: dask-ml   Author: dask   File: data.py
def transform(self, X, y=None):
        """Dummy encode the categorical columns in X

        Parameters
        ----------
        X : pd.DataFrame or dd.DataFrame
        y : ignored

        Returns
        -------
        transformed : pd.DataFrame or dd.DataFrame
            Same type as the input
        """
        if not X.columns.equals(self.columns_):
            raise ValueError("Columns of 'X' do not match the training "
                             "columns. Got {!r}, expected {!r}".format(
                                 X.columns, self.columns
                             ))
        if isinstance(X, pd.DataFrame):
            return pd.get_dummies(X, drop_first=self.drop_first)
        elif isinstance(X, dd.DataFrame):
            return dd.get_dummies(X, drop_first=self.drop_first)
        else:
            raise TypeError("Unexpected type {}".format(type(X))) 
Example 8
Project: aboleth   Author: data61   File: multi_input.py
def input_fn(df):
    """Format the downloaded data."""
    # Creates a dictionary mapping from each continuous feature column name (k)
    # to the values of that column stored in a constant Tensor.
    continuous_cols = [df[k].values for k in CONTINUOUS_COLUMNS]
    X_con = np.stack(continuous_cols).astype(np.float32).T

    # Standardise
    X_con -= X_con.mean(axis=0)
    X_con /= X_con.std(axis=0)

    # Creates a dictionary mapping from each categorical feature column name
    categ_cols = [np.where(pd.get_dummies(df[k]).values)[1][:, np.newaxis]
                  for k in CATEGORICAL_COLUMNS]
    n_values = [np.amax(c) + 1 for c in categ_cols]
    X_cat = np.concatenate(categ_cols, axis=1).astype(np.int32)

    # Converts the label column into a constant Tensor.
    label = df[LABEL_COLUMN].values[:, np.newaxis]

    # Returns the feature columns and the label.
    return X_con, X_cat, n_values, label 
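The np.where(pd.get_dummies(df[k]).values)[1] idiom extracts, row by row, the index of the dummy column that is set, i.e. an integer code per category. Assuming the column has no missing values, this matches pandas' own categorical codes, because get_dummies orders its columns by sorted category value:

import numpy as np
import pandas as pd

s = pd.Series(['cat', 'dog', 'cat', 'bird'])
codes_via_dummies = np.where(pd.get_dummies(s).values)[1]
codes_direct = s.astype('category').cat.codes.values
# both are [1, 2, 1, 0]: the categories sort as bird < cat < dog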
Example 9
Project: strategy   Author: kanghua309   File: train_ddqn.py
def replay(self):
        """Memory Management and training of the agent
        """
        if len(self.memory) < self.batch_size:
            return

        state, action, reward, next_state, done = self._get_batches()
        reward += (self.gamma
                   * np.logical_not(done)
                   * np.amax(self.model.predict(next_state), axis=1))
        q_target = self.target_model.predict(state)

        _ = pd.Series(action)
        one_hot = pd.get_dummies(_).as_matrix()
        action_batch = np.where(one_hot == 1)
        q_target[action_batch] = reward
        return self.model.fit(state, q_target,
                              batch_size=self.batch_size,
                              epochs=1,
                              verbose=False) 
Example 10
Project: next-book   Author: EmmaOnThursday   File: recommendation_creation.py
def make_date_columns_categorical_binary(book_attributes):
    """Turn all date columns in book_attributes into binary categorical columns."""

    # bucket publish dates & insert categorical data columns into data frame
    orig_pub_year_cat = transform_pub_dates(book_attributes['original_pub_year'])
    book_attributes.insert(loc=5, column='orig_pub_year_cat', value=orig_pub_year_cat)

    pub_year_cat = transform_pub_dates(book_attributes['pub_year'])
    book_attributes.insert(loc=5, column='pub_year_cat', value=pub_year_cat)

    # turn date categories into binary dataframes; merge back into book_attributes
    pub_year_dummies = pd.get_dummies(book_attributes['pub_year_cat'])
    orig_year_dummies = pd.get_dummies(book_attributes['orig_pub_year_cat'])

    book_full_attr = book_attributes.merge(pub_year_dummies,left_index=True, right_index=True)
    book_full_attr = book_full_attr.merge(orig_year_dummies,left_index=True, right_index=True)

    return book_full_attr 
Example 11
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_user_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00'):
    '''
    Count of each action type per user over the given time window.
    '''
    dump_path = './cache/user_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date=start_date, end_date=end_date, field=['user_id', 'time', 'type'])
        prefix = 'Action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df['type'], prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        drop_cols = ['time', 'type']
        df.drop(drop_cols, axis=1, inplace=True)
        df = df.groupby(['user_id'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    return df 
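The dummies-then-groupby-sum idiom used in this and the following JData examples builds a per-user count of each action type. pd.crosstab computes the same contingency table more directly (a sketch; the prefixed column names would still have to be applied afterwards):

import pandas as pd

df = pd.DataFrame({'user_id': [1, 1, 2], 'type': ['view', 'buy', 'view']})
via_dummies = (pd.concat([df[['user_id']], pd.get_dummies(df['type'])], axis=1)
                 .groupby('user_id', as_index=False).sum())
via_crosstab = pd.crosstab(df['user_id'], df['type'])
# both count action types per user; crosstab keeps user_id as the index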
Example 12
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_base_user_feat(end_date='2016-04-16'):
    '''
    Basic per-user features.
    '''
    dump_path = './cache/base_user_feat_{0}.pkl'.format(end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = pd.read_csv(USER_FILE, encoding='gbk')
        # sex_dummies = pd.get_dummies(df.sex, prefix='sex')
        df.user_reg_tm.fillna('2016-02-01', inplace=True)
        df.user_reg_tm = pd.to_datetime(df.user_reg_tm).apply(lambda t: pd.to_datetime('2016-02-01') if t > pd.to_datetime('2016-04-15') else t)
        df['reg_tm_dist'] = df.user_reg_tm.apply(lambda t: (pd.to_datetime(end_date) - t).days)
        df = df[['user_id', 'user_lv_cd', 'reg_tm_dist']]
        # df = pd.concat([df, sex_dummies], axis=1)
        # age_dummies = pd.get_dummies(df.age, prefix='age')
        # N = age_dummies.shape[1]
        # age_dummies.columns = ['age_{0}'.format(i) for i in range(N)]
        # df = pd.concat([df, age_dummies], axis=1)
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    return df 
Example 13
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_UIPair_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', actions= [1,2,3,4,5,6]):
    '''
    Action counts per user-item (UI) pair.
    '''
    dump_path = './cache/UIPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'sku_id', 'cate', 'type'])
        prefix = 'UIPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df['type'], prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        df.drop(['type'], axis=1, inplace=True)
        df = df.groupby(['user_id', 'sku_id', 'cate'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    
    actions.sort()
    rt_cols = ['user_id', 'sku_id', 'cate']
    rt_cols.extend(['UIPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
    df = df[rt_cols]

    return df 
Example 14
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_UCPair_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', actions=[1,2,3,4,5,6]):
    '''
    Action counts per user-category (UC) pair.
    '''
    dump_path = './cache/UCPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'type', 'cate'])
        prefix = 'UCPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df['type'], prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        df = df.groupby(['user_id', 'cate'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)

    actions.sort()
    rt_cols = ['user_id', 'cate']
    rt_cols.extend(['UCPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
    df = df[rt_cols]

    return df 
Example 15
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_base_item_feat(end_date = '2016/4/16'):
    '''
    Basic per-item features from the comment data.
    '''
    JComment = pd.read_csv(COMMENT_FILE, encoding='gbk')
    end_date = pd.to_datetime(end_date)
    JComment.dt = pd.to_datetime(JComment.dt)
    dts = JComment.dt.drop_duplicates()
    dts.sort_index(inplace=True, ascending=False)
    for dt in dts.iteritems():
        if dt[-1] < end_date:
            break
    JComment = JComment[JComment.dt == dt[-1]].drop(['dt'], axis=1)
    Comment_num_dummies = pd.get_dummies(JComment.comment_num, prefix='Comment_num')
    JComment = pd.concat([JComment, Comment_num_dummies], axis=1)

    return JComment.drop(['comment_num'], axis=1) 
Example 16
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_item_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', actions=[1,2,3,4,5,6]):
    '''
    Action counts per item.
    '''
    dump_path = './cache/item_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date = start_date, end_date = end_date, field=['sku_id', 'type'])
        prefix = 'item_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df['type'], prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        df.drop(['type'], axis=1, inplace=True)
        df = df.groupby(['sku_id'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    
    rt_cols = ['sku_id']
    rt_cols.extend(['item_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
    df = df[rt_cols]

    return df 
Example 17
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_UBPair_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-01 00:00:00', actions = [1,2,3,4,5,6]):
    '''
    Action counts per user-brand (UB) pair.
    '''
    dump_path = './cache/UBPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'brand', 'type'])
        prefix = 'UBPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df.type, prefix=prefix)
        df = pd.concat([df, type_dummies], axis=1)
        df.drop(['type'], axis=1, inplace=True)
        df = df.groupby(['user_id', 'brand'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    
    rt_cols = ['user_id', 'brand']
    rt_cols.extend(['UBPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
    df = df[rt_cols]

    return df 
Example 18
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_BCPair_action_cnt(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', actions=[1,2,3,4,5,6]):
    '''
    Action counts per brand-category (BC) pair.
    '''
    dump_path = './cache/BCPair_action_cnt_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date = start_date, end_date = end_date, field=['brand', 'cate', 'type'])
        prefix = 'BCPair_action_cnt_{0}_{1}'.format(start_date[:10], end_date[:10])
        type_dummies = pd.get_dummies(df.type, prefix=prefix)
        df = pd.concat([df.drop(['type'], axis=1), type_dummies], axis=1)
        df = df.groupby(['brand', 'cate'], as_index=False).sum()
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    
    rt_cols = ['brand', 'cate']
    rt_cols.extend(['BCPair_action_cnt_{0}_{1}_{2}'.format(start_date[:10], end_date[:10], i) for i in actions])
    df = df[rt_cols]

    return df 
Example 19
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_user_act_cnt_with_timeZone(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00'):
    '''
    Per-user action counts broken down by time zone.
    '''
    dump_path = './cache/user_act_cnt_with_timeZone_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'time_zone'])
        timeZone_dummies = pd.get_dummies(df.time_zone, prefix='time_zone_cnt')
        df = pd.concat([df, timeZone_dummies], axis=1)
        df.drop(['time_zone'], axis=1, inplace=True)
        df = df.groupby(['user_id'], as_index=False).sum()

        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    
    return df 
Example 20
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_UCPair_act_cnt_with_timeZone(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', cate=[8]):
    '''
    Per user-category action counts broken down by time zone.
    '''
    dump_path = './cache/UCPair_act_cnt_with_timeZone_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'time_zone', 'cate'])
        timeZone_dummies = pd.get_dummies(df.time_zone, prefix='uc_time_zone_cnt')
        df = pd.concat([df, timeZone_dummies], axis=1)
        df.drop(['time_zone'], axis=1, inplace=True)
        df = df.groupby(['user_id', 'cate'], as_index=False).sum()

        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)

    df = df[df.cate.isin(cate)]
    return df 
Example 21
Project: JData-algorithm-competition   Author: wrzto   File: features_generator.py
def load_UIPair_act_cnt_with_timeZone(start_date = '2016-02-01 00:00:00', end_date = '2016-04-16 00:00:00', cate=[8]):
    '''
    Per user-item action counts broken down by time zone.
    '''
    dump_path = './cache/UIPair_act_cnt_with_timeZone_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        df = get_action_data(start_date = start_date, end_date = end_date, field=['user_id', 'time_zone', 'sku_id'])
        timeZone_dummies = pd.get_dummies(df.time_zone, prefix='time_zone_cnt')
        df = pd.concat([df, timeZone_dummies], axis=1)
        df.drop(['time_zone'], axis=1, inplace=True)
        df = df.groupby(['user_id', 'sku_id'], as_index=False).sum()

        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
            
    return df 
Example 22
Project: expected_goals   Author: andrebrener   File: xg_model.py
def get_table(train_table):
    x_cols = []
    for col in train_table.columns:
        # print(data[col].value_counts())
        if col not in ['result', 'team_name', 'competition', 'season_x',
                       'surname']:
            train_table[col] = train_table[col].astype(str)
            x_cols.append(col)

    # print(x_cols)

    X = pd.get_dummies(train_table[x_cols])
    y = train_table['result']

    print(train_table.shape)
    print(X.shape)
    print(y.shape)

    return X, y 
Example 23
Project: Davies_Bouldin_Index_KMeans   Author: akankshadara   File: index.py
def main():
	df = pd.read_csv("dataset.csv")
	df = df.dropna()
	# print df
	x1 = df.copy()
	del x1['Customer']
	del x1['Effective To Date']
	x4 = pd.get_dummies(x1)
	# print x4
	n = 10
	clf = k_means(x4, n_clusters = n)
	centroids = clf[0] 
	# 10 clusters
	labels = clf[1] 
	# print x4[1]
	index_db_val = compute_DB_index(x4, labels, centroids, n)
	print "The value of Davies Bouldin index for a K-Means cluser of size " + str(n) + " is: " + str(index_db_val) 
Example 24
Project: DSI-personal-reference-kit   Author: teb311   File: cleaner.py
def dummify(df):
    '''
        Given a dataframe, create dummy columns for every column that is not
        already numerically typed. This will NOT drop one dummy per feature,
        which is required for (unregularized) linear regression.

        returns DataFrame -- a dataframe with all non-numeric columns swapped into dummy columns
    '''
    obj_cols = []
    for cname in df.columns:
        if df[cname].dtype == object:
            obj_cols.append(cname)

    df = pd.get_dummies(df, columns=obj_cols)
    # for cname in obj_cols:
    #     del df[cname]

    return df 
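As the commented-out loop hints, manual deletion is unnecessary: when get_dummies receives a columns= list, it replaces those columns with their dummies in the returned frame. A quick check:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
list(pd.get_dummies(df, columns=['b']).columns)  # ['a', 'b_x', 'b_y']; 'b' is gone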
Example 25
Project: TextStageProcessor   Author: mhyhre   File: apriori_maker.py
def apriori_alg(trans, support=0.01, minlen=2):
    print('appr_1')
    dna = trans.unstack().dropna()
    print('appr_2')
    ts = pandas.get_dummies(dna).groupby(level=1).sum()
    print('appr_3')
    nrows, ncols = ts.shape
    pattern = []
    for cnum in range(minlen, ncols + 1):
        for cols in combinations(ts, cnum):
            print('cnum', cnum)
            patsup = ts[list(cols)].all(axis=1).sum()
            patsup = float(patsup) / nrows
            pattern.append([",".join(cols), patsup])
    print('appr_4')
    sdf = pandas.DataFrame(pattern, columns=["Pattern", "Support"])
    print('appr_5')
    results = sdf[sdf.Support >= support]
    print('appr_6')
    return results

Example 26
Project: Tencent_Social_Ads   Author: freelzy   File: doFeats_2.py
def doOneHot(X_train, X_test):
    res = X_test[['instanceID']]
    X_test.drop('instanceID', axis=1, inplace=True)
    data = X_train.append(X_test, ignore_index=True)
    del X_train, X_test
    gc.collect()

    features_trans = ['gender','appCategory_main','connectionType']
    data = pd.get_dummies(data, columns=features_trans)

    X_train = data.loc[data['label'] != -1, :]
    X_test = data.loc[data['label'] == -1, :]
    X_test.loc[:, 'instanceID'] = res.values
    del data
    gc.collect()
    return X_train, X_test 
Example 27
Project: Steal-ML   Author: ftramer   File: utils.py
def prepare_gss(onehot=True):
    data = pd.read_csv('../data/GSShappiness.csv')

    del data['year']
    del data['id']

    data = data.dropna()
    target = "Happiness level"

    X = data[list(set(data.columns) - set([target]))]
    y = data[target]

    if onehot:
        X = pd.get_dummies(X)

    return X, y 
Example 28
Project: jsaicup2017   Author: SS1031   File: data_loader.py
def thunder():
    if os.path.exists('../dataset/thunder.pkl'):
        return pd.read_pickle('../dataset/thunder.pkl')

    thunder_df = pd.read_csv('../input/thunder.csv',
                             names=[
                                 'datetime',    # observation timestamp
                                 'lat',         # latitude (decimal degrees)
                                 'lon',         # longitude (decimal degrees)
                                 'type'         # lightning type; CG: cloud-to-ground, IC: intra-cloud
                             ])

    # parse the timestamp column into datetimes
    thunder_df.datetime = pd.to_datetime(thunder_df.datetime)

    # observation_point_df.to_pickle('../dataset/observation_point.pkl')
    thunder_df = pd.concat([thunder_df, pd.get_dummies(thunder_df.type)], axis=1)
    thunder_df.to_pickle('../dataset/thunder.pkl')  # same path as the cache check above

    return thunder_df 
Example 29
Project: Titanic   Author: dataventureutc   File: prepare.py
def load_data():

    data = pd.read_csv('data/train.csv')

    # drop rows with empty features / gaps in columns
    data = data.dropna()

    # Categorical values into numerical (one hot encoding)
    one_hot_embarked = pd.get_dummies(data['Embarked'], prefix='embarked')
    data = data.join(one_hot_embarked)

    one_hot_pclass = pd.get_dummies(data['Pclass'], prefix='pclass')
    data = data.join(one_hot_pclass)

    # The Sex column has only two values (M/F), so a single 0/1 column suffices
    # instead of one-hot encoding with two columns
    data['sex'] = data.apply(lambda x: 1 if (x['Sex'] == 'female') else 0, axis=1)

    # Drop features not used for training the model
    data = data.drop(['Cabin', 'Name', 'PassengerId', 'Pclass', 'Sex', 'Ticket', 'Embarked'], axis=1)

    return data.drop(['Survived'], axis=1), data[['Survived']] 
Example 30
Project: mars_express   Author: wsteitz   File: data.py
def parse_context_dmop(path):
    df = read(path, "dmop")

    # ATTT-A and ATTT-B are different
    attt = df[df['subsystem'].str.startswith("ATTT")].copy()  # copy to avoid mutating a view of df
    attt['subsystem'] = attt['subsystem'].str[:3] + attt['subsystem'].str[-1]
    
    df = pd.concat([attt, df])

    # take the first 4 chars
    df['subsystem'] = df['subsystem'].str[:4]

    # convert to 1 / 0
    df = pd.get_dummies(df.subsystem)
    df = df.resample("1h").sum().fillna(0.0)
    
    df['sum_dmop'] = df.sum(axis=1)

    return df 
Example 31
Project: mars_express   Author: wsteitz   File: data.py
def parse_context_ftl(path):
    raw = read(path, "ftl")

    df = raw.copy()
    df['ut_ms'] = pd.to_datetime(raw['utb_ms'], unit='ms')
    df.sort_values("ut_ms", inplace=True)
    # dummies
    df = df.set_index('ut_ms')
    dummies = pd.get_dummies(df.type).join(df['flagcomms'], how="outer")
    dummies = dummies.resample("1h").sum().fillna(0.0)

    df = raw.copy()
    df['event'] = df.type + df.flagcomms.astype("str")
    del df['type'], df['flagcomms']
    df['ute_ms'] = pd.to_datetime(df['ute_ms'], unit='ms')
    df['utb_ms'] = pd.to_datetime(df['utb_ms'], unit='ms')
    durations = [event_to_min_per_hour(df, event) for event in df.event.unique()]
    durations = pd.concat(durations, axis=1).fillna(0)

    return dummies.join(durations, how="outer") 
Example 32
Project: sklearnflask   Author: amirziai   File: main.py
def predict():
    if clf:
        try:
            json_ = request.json
            query = pd.get_dummies(pd.DataFrame(json_))

            # https://github.com/amirziai/sklearnflask/issues/3
            # Thanks to @lorenzori
            query = query.reindex(columns=model_columns, fill_value=0)

            prediction = list(clf.predict(query))

            return jsonify({'prediction': prediction})

        except Exception, e:

            return jsonify({'error': str(e), 'trace': traceback.format_exc()})
    else:
        print 'train first'
        return 'no model here' 
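The reindex call is what keeps this endpoint from failing on unseen input: dummies built from a single JSON payload contain only the categories present in that payload, so the frame is realigned to the column list saved at training time, padding missing dummies with 0 and dropping unknown ones. A toy illustration (model_columns here is a hypothetical saved list):

import pandas as pd

model_columns = ['age', 'color_blue', 'color_red']  # hypothetical, saved at fit time
query = pd.get_dummies(pd.DataFrame({'age': [30], 'color': ['green']}))
list(query.columns)  # ['age', 'color_green']
query = query.reindex(columns=model_columns, fill_value=0)
list(query.columns)  # ['age', 'color_blue', 'color_red']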
Example 33
Project: The_Ultimate_Student_Hunt   Author: analyticsvidhya   File: xgb.py
def preprocess(file,istrian):
	df=pd.read_csv(file,parse_dates=['Date'],dayfirst=True)
	end_missing=['Average_Atmospheric_Pressure','Max_Atmospheric_Pressure',
	'Min_Atmospheric_Pressure','Min_Ambient_Pollution','Max_Ambient_Pollution']
	df=df.fillna(-1)
	if istrian:
		outcome=df.Footfall
		df=df.drop(['Footfall'],axis=1)
	else:
		outcome=np.nan

	df['month']=df['Date'].apply(lambda x: x.month)
	df['date']=df['Date'].apply(lambda x: x.day)
	df['weekday']=df['Date'].apply(lambda x: x.weekday())
	df['sardiya']=df['month'].apply(lambda x: 1 if x in [1,2,11,12,3] else 0)
	df.date=df.date.apply(get_normal_date)
	park_dummies = pd.get_dummies(df.Park_ID, prefix='park')
	location_dummies = pd.get_dummies(df.Location_Type, prefix='location')
	df['Direction_Of_Wind2']=df.Direction_Of_Wind.apply(get_wind_dir)

	return df,outcome

# load training set
Example 34
Project: JDcontest   Author: zsyandjyhouse   File: gen_sku_fea.py
def get_comment_product_fea(endtime):
    enddt = pd.to_datetime(endtime,format = '%Y-%m-%d')
    if enddt == pd.to_datetime('2016-04-15',format = '%Y-%m-%d'):
        commentdata = pd.read_csv(FilePath + CommentFile)
        commentdata = commentdata[(commentdata["dt"] == "2016-04-15")]
        commentdata = commentdata.sort_values(by="sku_id").reset_index()[["sku_id", "comment_num", "has_bad_comment", "bad_comment_rate"]]
        return commentdata
    else:
        startdt = enddt - pd.Timedelta(days=7)
        commentpath = FilePath + CommentFile
        commentdata_ALL = pd.read_csv(commentpath)  # read the JData comment csv
        commentdata_ALL.dt = pd.to_datetime(commentdata_ALL.dt, format='%Y-%m-%d')  # convert the dt column to datetime
        comment = commentdata_ALL[(commentdata_ALL.dt <= enddt) & (commentdata_ALL.dt > startdt)]
        df = pd.get_dummies(comment['comment_num'], prefix='comment_num')
        comment = pd.concat([comment, df], axis=1)
        comment = comment[['sku_id', 'has_bad_comment', 'bad_comment_rate', 'comment_num_1', 'comment_num_2', 'comment_num_3','comment_num_4']]
        sorted_comment = comment.sort_values(by=['sku_id']).reset_index().drop('index',1)
        #sorted_comment.to_csv(FilePath + 'skuFeaInComment_before'+str(enddt), index=False)
        return sorted_comment

Example 35
Project: JDcontest   Author: zsyandjyhouse   File: gen_action_fea.py
def get_action_feat(start_time, end_time,action_data):
    actions=action_data[(action_data['time']>=start_time)&(action_data['time']<=end_time)]
    #actions = get_actions(start_time, end_time)
    #actions = actions[actions['cate'] == 8]
    actions = actions[['user_id', 'sku_id', 'type']]
    df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_time, end_time))
    actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
    actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
    
    actions.fillna(0,inplace=True)
    name='%s-%s-action' % (start_time, end_time)
    actions[name+'_1256']=actions[name+'_1']+actions[name+'_2']+actions[name+'_5']+actions[name+'_6']
    actions[name+'_1256_d_4']=actions[name+'_4']/actions[name+'_1256']

    del actions['type']
    # action_fea_file = 'action_fea_' + STARTdt_str + 'to' + ENDdt_str + '.csv'
    # action_fea.to_csv(FilePath + action_fea_file, index=False)
    return actions

Example 36
Project: JDcontest   Author: zsyandjyhouse   File: gen_user_fea.py
def get_basic_user_fea():
    user = pd.read_csv(FilePath+UserFile, encoding='gbk')
    # user['age'] = user['age'].map(convert_age)
    user['age']=user['age'].replace([u'16-25岁',u'26-35岁',u'36-45岁',u'46-55岁',u'56岁以上'],[1,2,3,4,5])
    user=user[((user['age']==1) |
                (user['age']==2) |
                ( user['age']==3) |
                (user['age']==4) |
                (user['age']==5)|
                (user['age']==-1))]
    age_df = pd.get_dummies(user["age"], prefix="age")
    sex_df = pd.get_dummies(user["sex"], prefix="sex")
    user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
    user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
    user.to_csv(FilePath + 'user_basic_fea.csv',index=False)
    return user

Example 37
Project: tdlstm   Author: bluemonk482   File: dataprocessor.py
def next_batch(self):
		df = self.batch_df[self.pointer]
		x = np.array([d[0] for d in df])
		xl = np.array([d[1] for d in df])
		xr = np.array([d[2] for d in df])
		tar = np.array([d[3] for d in df])
		y = np.array([d[-1] for d in df])
		y = pd.get_dummies(y).values.astype(np.int32)
		seq_len = [len(seq) for seq in x]
		seq_len_l = [len(seq) for seq in xl]
		seq_len_r = [len(seq) for seq in xr]
		if self.dynamic_padding:
			x = np.array(self.pad_minibatches(x, 'RIGHT'))
			xl = np.array(self.pad_minibatches(xl, 'RIGHT'))
			xr = np.array(self.pad_minibatches(xr, 'RIGHT'))
		self.pointer += 1
		return x, y, seq_len, xl, seq_len_l, xr, seq_len_r, tar 
Example 38
Project: tdlstm   Author: bluemonk482   File: electionprocessor.py
def next_batch(self):
		df = self.batch_df[self.pointer]
		x = np.array([d[0] for d in df])
		xl = np.array([d[1] for d in df])
		xr = np.array([d[2] for d in df])
		tar = np.array([d[3] for d in df])
		y = np.array([d[-1] for d in df])
		# y = pd.get_dummies(y).values.astype(np.int32)
		seq_len = [len(seq) for seq in x]
		seq_len_l = [len(seq) for seq in xl]
		seq_len_r = [len(seq) for seq in xr]
		if self.dynamic_padding:
			x = np.array(self.pad_minibatches(x, 'RIGHT'))
			xl = np.array(self.pad_minibatches(xl, 'RIGHT'))
			xr = np.array(self.pad_minibatches(xr, 'RIGHT'))
		self.pointer += 1
		return x, y, seq_len, xl, seq_len_l, xr, seq_len_r, tar 
Example 39
Project: MF_MBS_Default_Risk   Author: bentruitt   File: mf_mrtg_default_model.py
def load_data(in_file):
    # read csv file prepared by freddie_data_analysis module
    df = pd.read_csv(in_file)
    # drop unneeded columns
    columns = df.columns.tolist()
    for col in columns:
        if 'Unnamed' in col:
            df.drop(col, axis=1, inplace=True)
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
    df.drop(['published_date'], axis=1, inplace=True)
    # replace nan values with 0
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    # apply get_dummies to particular columns
    df = pd.get_dummies(df, prefix=['state'], columns=['property_state'])
    df = pd.get_dummies(df, prefix=['ss'], columns=['special_servicer'])
    # return prepared dataframe
    return df 
Example 40
Project: face-to-emotion   Author: mhw32   File: utils.py
def gen_fer2013_csv(csv_path, reshape_width=48, reshape_height=48):
    data = pd.read_csv(csv_path)
    pixels = data['pixels'].tolist()
    width, height = 48, 48
    faces = []
    for pixel_sequence in pixels:
        face = [int(pixel) for pixel in pixel_sequence.split(' ')]
        face = np.asarray(face).reshape(width, height)
        face = cv2.resize(face.astype('uint8'),
                          (reshape_width, reshape_height))
        faces.append(face.astype('float32'))

    faces = np.asarray(faces)
    faces = np.expand_dims(faces, -1)
    emotions = pd.get_dummies(data['emotion']).as_matrix()

    return faces, emotions 
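Here get_dummies one-hot encodes the integer emotion labels in a single call. A caveat for readers on current pandas: DataFrame.as_matrix() was removed in pandas 1.0, so the equivalent today is .to_numpy():

import pandas as pd

labels = pd.Series([0, 2, 1, 2])
emotions = pd.get_dummies(labels).to_numpy()  # shape (4, 3), one column per class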
Example 41
Project: real_estate   Author: cooperoelrichs   File: xy.py
def make_x(self, df):
        x_spec = self.get_individualised_x_spec()


        X = df[XY.reduce_tuples(
            [a for a, b in x_spec if b != 'linear_by_categorical']
        )].copy()
        cats = XY.reduce_tuples(
            [a for a, b in x_spec if b == 'categorical' or b == 'ordinal']
        )

        X = self.prep_work(X, x_spec)

        X = pd.get_dummies(
            X, prefix=cats, prefix_sep='_', columns=cats,
            drop_first=False, dummy_na=False
        )

        return X 
Example 42
Project: jdata   Author: learn2Pro   File: xgb_feature.py
def get_comments_product_feat(start_date, end_date):
    dump_path = './cache/comments_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        comments = pickle.load(open(dump_path))
    else:
        comments = pd.read_csv(comment_path)
        comment_date_end = end_date
        comment_date_begin = comment_date[0]
        for date in reversed(comment_date):
            if date < comment_date_end:
                comment_date_begin = date
                break
        comments = comments[(comments.dt >= comment_date_begin) & (comments.dt < comment_date_end)]
        df = pd.get_dummies(comments['comment_num'], prefix='comment_num')
        comments = pd.concat([comments, df], axis=1)  # type: pd.DataFrame
        # del comments['dt']
        # del comments['comment_num']
        comments = comments[
            ['sku_id', 'has_bad_comment', 'bad_comment_rate', 'comment_num_1', 'comment_num_2', 'comment_num_3',
             'comment_num_4']]
        pickle.dump(comments, open(dump_path, 'w'))
    return comments 
Example 43
Project: jdata   Author: learn2Pro   File: xgb_feature.py
def get_accumulate_product_feat(start_date, end_date):
    feature = ['sku_id', 'product_action_1_ratio', 'product_action_2_ratio', 'product_action_3_ratio',
               'product_action_5_ratio', 'product_action_6_ratio']
    dump_path = './cache/product_feat_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path))
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['sku_id'], df], axis=1)
        actions = actions.groupby(['sku_id'], as_index=False).sum()
        actions['product_action_1_ratio'] = actions['action_4'] / actions['action_1']
        actions['product_action_2_ratio'] = actions['action_4'] / actions['action_2']
        actions['product_action_3_ratio'] = actions['action_4'] / actions['action_3']
        actions['product_action_5_ratio'] = actions['action_4'] / actions['action_5']
        actions['product_action_6_ratio'] = actions['action_4'] / actions['action_6']
        actions = actions[feature]
        pickle.dump(actions, open(dump_path, 'w'))
    return actions 
Example 44
Project: JData   Author: Xls1994   File: gen_feat.py
def get_basic_user_feat():
    dump_path = './cache/basic_user.csv'
    # one-hot coding age,sex,lv-cd
    if os.path.exists(dump_path):
        # user = pickle.load(open(dump_path))
        user = pd.read_csv(dump_path)
    else:
        user = pd.read_csv(user_path, encoding='gbk')
        user['age'] = user['age'].map(convert_age)  # map age ranges to numeric codes

        user['user_reg_tm'] = user['user_reg_tm'].map(convert_reg_date)

        age_df = pd.get_dummies(user["age"], prefix="age")

        sex_df = pd.get_dummies(user["sex"], prefix="sex")
        # user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
        user = pd.concat([user[['user_id', 'user_reg_tm', 'user_lv_cd']], age_df, sex_df], axis=1)
        # pickle.dump(user, open(dump_path, 'w'))
        user.to_csv(dump_path, index=False, encoding='utf-8')
    print 'finish get basic user info'
    return user 
Example 45
Project: JData   Author: Xls1994   File: gen_feat.py
def get_basic_product_feat():
    dump_path = './cache/basic_product.csv'
    # one-hot coding a1,a2,a3
    if os.path.exists(dump_path):
        # product = pickle.load(open(dump_path))
        product = pd.read_csv(dump_path)
    else:
        product = pd.read_csv(product_path)
        attr1_df = pd.get_dummies(product["a1"], prefix="a1")
        attr2_df = pd.get_dummies(product["a2"], prefix="a2")
        attr3_df = pd.get_dummies(product["a3"], prefix="a3")
        cate_df = pd.get_dummies(product['cate'], prefix='cate')
        brand_df = pd.get_dummies(product['brand'], prefix='brand')
        # product = pd.concat([product[['sku_id','brand']], attr1_df, attr2_df, attr3_df,cate_df], axis=1)
        product = pd.concat([product[['sku_id','brand']], attr1_df, attr2_df, attr3_df, brand_df, cate_df], axis=1)
        # pickle.dump(product, open(dump_path, 'w'))
        product.to_csv(dump_path, index=False)
    print 'finish get basic product info'
    return product 
Example 46
Project: JData   Author: Xls1994   File: gen_feat.py
def get_action_feat(start_date, end_date):
    '''
    Action:
    1. browse (view the product detail page)
    2. add to cart; 3. remove from cart; 4. place order; 5. follow; 6. click
    '''
    dump_path = './cache/action_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        # actions = pickle.load(open(dump_path))
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        # actions = pd.read_csv(action_1_path)
        actions = actions[['user_id', 'sku_id', 'type']]
        df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date))
        actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
        # aggregate the action-type dummy counts per (user_id, sku_id) pair
        actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
        del actions['type']
        # pickle.dump(actions, open(dump_path, 'w'))
        actions.to_csv(dump_path, index=False)
    print 'finish get action feat'
    return actions 
Example 47
Project: JData   Author: Xls1994   File: gen_feat.py
def get_accumulate_brand_feat(start_date, end_date):
    feature = ['brand', 'brand_action_1_ratio', 'brand_action_2_ratio', 'brand_action_3_ratio',
               'brand_action_5_ratio', 'brand_action_6_ratio', 'brand_action_num']
    dump_path = './cache/brand_feat_accumulate_%s_%s.csv' %(start_date,end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date,end_date)
        df = pd.get_dummies(actions['type'],prefix='action')
        actions = pd.concat([actions['brand'],df],axis=1)
        actions = actions.groupby(['brand'],as_index = False).sum()
        actions['brand_action_1_ratio'] = actions['action_4']/actions['action_1']
        actions['brand_action_2_ratio'] = actions['action_4']/actions['action_2']
        actions['brand_action_3_ratio'] = actions['action_4']/actions['action_3']

        actions['brand_action_5_ratio'] = actions['action_4']/actions['action_5']
        actions['brand_action_6_ratio'] = actions['action_4']/actions['action_6']
        actions['brand_action_num'] = actions['action_1'] + actions['action_2'] + actions['action_3'] + actions[
            'action_4'] + actions['action_5'] + actions['action_6']
        actions = actions[feature]
        actions = actions.replace(np.inf, 9999)
        actions.to_csv(dump_path)
    return actions
Example 48
Project: Benchmarks   Author: ECP-CANDLE   File: p1b2.py
def load_data(shuffle=True, n_cols=None):
    train_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.train.csv')
    test_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.test.csv')

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.iloc[:, 2:].as_matrix()
    X_test = df_test.iloc[:, 2:].as_matrix()

    y_train = pd.get_dummies(df_train[['cancer_type']]).as_matrix()
    y_test = pd.get_dummies(df_test[['cancer_type']]).as_matrix()

    return (X_train, y_train), (X_test, y_test) 
Example 49
Project: mlbootcamp_5   Author: ivan-filonov   File: features_text_2.py
def build(self):
        train, _, test, _ = data.get()
        cset = []
        ntrain = len(train)
        df = pd.concat([train, test], axis=0)
        to_drop = df.columns
        for sc in ['height', 'weight', 'ap_hi', 'ap_lo']:
            tc = df[sc].apply(str)
            maxc = tc.apply(len).max()
            for n in range(maxc):
                df['ft_l_'+sc+'_'+str(n)] = tc.apply(lambda s:ord(s[n])  if n < len(s) else -1)
                df['ft_r_'+sc+'_'+str(n)] = tc.apply(lambda s:ord(s[-n]) if n < len(s) else -1)
                cset.append('ft_l_'+sc+'_'+str(n))
                cset.append('ft_r_'+sc+'_'+str(n))

        df = pd.get_dummies(df, columns=cset).drop(to_drop, axis=1)
        self.train_= df[:ntrain]
        self.test_ = df[ntrain:]
        return self.train_, self.test_, None 
Example 50
Project: mlbootcamp_5   Author: ivan-filonov   File: features_kmeans_1.py
def build(self):
        train, y, test, _ = data.get()

        ntrain = len(train)
        df = pd.concat([train, test], axis=0)
        to_drop = df.columns

        dcn = []
        for n in [2, 5, 10, 15, 25]:
            cname = 'kmeans_' + str(n)
            dcn.append(cname)
            df[cname] = cluster.KMeans(n_clusters=n).fit_predict(df)

        df = pd.get_dummies(df, columns=dcn)

        df = df.drop(to_drop, axis=1)
        train = df[:ntrain]
        test = df[ntrain:].copy()

        return train.astype('int32'), test.astype('int32'), None
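A closing note on Examples 49-50: the engineered columns being encoded here are numeric (ord() codes, KMeans cluster ids), and pd.get_dummies leaves numeric columns untouched unless they are named explicitly, which is why both examples pass a columns= list. Compare:

import pandas as pd

df = pd.DataFrame({'k': [0, 1, 0]})
list(pd.get_dummies(df).columns)                 # ['k']; numeric column passed through
list(pd.get_dummies(df, columns=['k']).columns)  # ['k_0', 'k_1']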