Python pandas.get_dummies() Examples

The following are 30 code examples of pandas.get_dummies(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas , or try the search function .
Example #1
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_int_df(self, dtype):
        data = DataFrame(
            {'A': [1, 2, 1],
             'B': pd.Categorical(['a', 'b', 'a']),
             'C': [1, 2, 1],
             'D': [1., 2., 1.]
             }
        )
        columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
        expected = DataFrame([
            [1, 1., 1, 0, 1, 0],
            [2, 2., 0, 1, 0, 1],
            [1, 1., 1, 0, 1, 0]
        ], columns=columns)
        expected[columns[2:]] = expected[columns[2:]].astype(dtype)
        result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
        tm.assert_frame_equal(result, expected) 
Example #2
Source File: BPNN_Classify_Data.py    From Machine-Learning-for-Beginner-by-Python3 with MIT License 6 votes vote down vote up
def handle_data(filepath, miss='fill'):  # 定义处理数据的函数
    data = pd.read_csv(r'%s'%filepath)
    data = data.replace('?', np.nan)
    #  处理缺失值
    if miss == 'del':  # 删除掉缺失值
        miss_data = data.dropna(how='any')
    else:
        miss_data = data.fillna(method='ffill')
    #  新建DataFrame
    newdata = pd.DataFrame()
    #  独热化编码
    for ikey in miss_data:
        if miss_data[ikey].dtype == 'object':  # 独热编码
            onedata = pd.get_dummies(miss_data[ikey])
            newdata = pd.concat([newdata, onedata], axis=1)
        else:
            newdata[ikey] = miss_data[ikey]
    return newdata 
Example #3
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
        df['cat'] = pd.Categorical(['x', 'y', 'y'])
        result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
        if sparse:
            arr = SparseArray
            typ = SparseDtype(dtype, 0)
        else:
            arr = np.array
            typ = dtype

        expected = DataFrame({'C': [1, 2, 3],
                              'A_a': arr([1, 0, 1], dtype=typ),
                              'A_b': arr([0, 1, 0], dtype=typ),
                              'B_b': arr([1, 1, 0], dtype=typ),
                              'B_c': arr([0, 0, 1], dtype=typ),
                              'cat_x': arr([1, 0, 0], dtype=typ),
                              'cat_y': arr([0, 1, 1], dtype=typ)
                              }).sort_index(axis=1)

        assert_frame_equal(result, expected) 
Example #4
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_dataframe_dummies_prefix_str(self, df, sparse):
        # not that you should do this...
        result = get_dummies(df, prefix='bad', sparse=sparse)
        bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c']
        expected = DataFrame([[1, 1, 0, 1, 0],
                              [2, 0, 1, 1, 0],
                              [3, 1, 0, 0, 1]],
                             columns=['C'] + bad_columns,
                             dtype=np.uint8)
        expected = expected.astype({"C": np.int64})
        if sparse:
            # work around astyping & assigning with duplicate columns
            # https://github.com/pandas-dev/pandas/issues/14427
            expected = pd.concat([
                pd.Series([1, 2, 3], name='C'),
                pd.Series([1, 0, 1], name='bad_a', dtype='Sparse[uint8]'),
                pd.Series([0, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
                pd.Series([1, 1, 0], name='bad_b', dtype='Sparse[uint8]'),
                pd.Series([0, 0, 1], name='bad_c', dtype='Sparse[uint8]'),
            ], axis=1)

        assert_frame_equal(result, expected) 
Example #5
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_dataframe_dummies_prefix_list(self, df, sparse):
        prefixes = ['from_A', 'from_B']
        result = get_dummies(df, prefix=prefixes, sparse=sparse)
        expected = DataFrame({'C': [1, 2, 3],
                              'from_A_a': [1, 0, 1],
                              'from_A_b': [0, 1, 0],
                              'from_B_b': [1, 1, 0],
                              'from_B_c': [0, 0, 1]},
                             dtype=np.uint8)
        expected[['C']] = df[['C']]
        cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
        expected = expected[['C'] + cols]

        typ = pd.SparseArray if sparse else pd.Series
        expected[cols] = expected[cols].apply(lambda x: typ(x))
        assert_frame_equal(result, expected) 
Example #6
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_just_na(self, sparse):
        just_na_list = [np.nan]
        just_na_series = Series(just_na_list)
        just_na_series_index = Series(just_na_list, index=['A'])

        res_list = get_dummies(just_na_list, sparse=sparse)
        res_series = get_dummies(just_na_series, sparse=sparse)
        res_series_index = get_dummies(just_na_series_index, sparse=sparse)

        assert res_list.empty
        assert res_series.empty
        assert res_series_index.empty

        assert res_list.index.tolist() == [0]
        assert res_series.index.tolist() == [0]
        assert res_series_index.index.tolist() == ['A'] 
Example #7
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_dataframe_dummies_all_obj(self, df, sparse):
        df = df[['A', 'B']]
        result = get_dummies(df, sparse=sparse)
        expected = DataFrame({'A_a': [1, 0, 1],
                              'A_b': [0, 1, 0],
                              'B_b': [1, 1, 0],
                              'B_c': [0, 0, 1]},
                             dtype=np.uint8)
        if sparse:
            expected = pd.DataFrame({
                "A_a": pd.SparseArray([1, 0, 1], dtype='uint8'),
                "A_b": pd.SparseArray([0, 1, 0], dtype='uint8'),
                "B_b": pd.SparseArray([1, 1, 0], dtype='uint8'),
                "B_c": pd.SparseArray([0, 0, 1], dtype='uint8'),
            })

        assert_frame_equal(result, expected) 
Example #8
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_dataframe_dummies_prefix_sep(self, df, sparse):
        result = get_dummies(df, prefix_sep='..', sparse=sparse)
        expected = DataFrame({'C': [1, 2, 3],
                              'A..a': [1, 0, 1],
                              'A..b': [0, 1, 0],
                              'B..b': [1, 1, 0],
                              'B..c': [0, 0, 1]},
                             dtype=np.uint8)
        expected[['C']] = df[['C']]
        expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
        assert_frame_equal(result, expected)

        result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse)
        expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
        assert_frame_equal(result, expected)

        result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'},
                             sparse=sparse)
        assert_frame_equal(result, expected) 
Example #9
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_just_na(self, sparse):
        just_na_list = [np.nan]
        just_na_series = Series(just_na_list)
        just_na_series_index = Series(just_na_list, index=['A'])

        res_list = get_dummies(just_na_list, sparse=sparse)
        res_series = get_dummies(just_na_series, sparse=sparse)
        res_series_index = get_dummies(just_na_series_index, sparse=sparse)

        assert res_list.empty
        assert res_series.empty
        assert res_series_index.empty

        assert res_list.index.tolist() == [0]
        assert res_series.index.tolist() == [0]
        assert res_series_index.index.tolist() == ['A'] 
Example #10
Source File: titanic.py    From MachineLearning with Apache License 2.0 6 votes vote down vote up
def process_ticket(combined):
    # a function that extracts each prefix of the ticket,
    # returns 'XXX' if no prefix (i.e the ticket is a digit)
    def cleanTicket(ticket):
        ticket = ticket.replace('.','')
        ticket = ticket.replace('/','')
        ticket = ticket.split()
        ticket = map(lambda t : t.strip() , ticket)
        ticket = filter(lambda t : not t.isdigit(), ticket)
        if len(ticket) > 0:
            return ticket[0]
        else:
            return 'XXX'

    # Extracting dummy variables from tickets:
    combined['Ticket'] = combined['Ticket'].map(cleanTicket)
    tickets_dummies = pd.get_dummies(combined['Ticket'],prefix='Ticket')
    combined = pd.concat([combined, tickets_dummies],axis=1)
    combined.drop('Ticket',inplace=True,axis=1)
    return combined 
Example #11
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_basic(self, sparse, dtype):
        s_list = list('abc')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame({'a': [1, 0, 0],
                              'b': [0, 1, 0],
                              'c': [0, 0, 1]},
                             dtype=self.effective_dtype(dtype))
        if sparse:
            expected = expected.apply(pd.SparseArray, fill_value=0.0)
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected)

        expected.index = list('ABC')
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        assert_frame_equal(result, expected) 
Example #12
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_basic_drop_first_one_level(self, sparse):
        # Test the case that categorical variable only has one level.
        s_list = list('aaa')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame(index=np.arange(3))

        result = get_dummies(s_list, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        result = get_dummies(s_series, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        expected = DataFrame(index=list('ABC'))
        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected) 
Example #13
Source File: SVM_Classify_Data.py    From Machine-Learning-for-Beginner-by-Python3 with MIT License 6 votes vote down vote up
def trans(exdata, nor=normal, oh=one_hot, bin=binary):
    keylist = exdata.keys()
    newexdata = pd.DataFrame()
    for ikey in range(len(keylist)):
        if ikey + 1 in nor:
            newexdata[keylist[ikey]] = (exdata[keylist[ikey]] - exdata[keylist[ikey]].mean()) / exdata[keylist[ikey]].std()
        elif ikey + 1 in bin:
            newexdata[keylist[ikey]] = [1 if inum == 1 else -1 for inum in exdata[keylist[ikey]]]
        elif ikey + 1 in oh:
            newdata = pd.get_dummies(exdata[keylist[ikey]], prefix=keylist[ikey])
            newexdata = pd.concat([newexdata,newdata], axis=1)
    return newexdata


# 类别说明
# Absence (1) 1类
# presence (2) -1类

#  将训练数据平均分为n份,利用K折交叉验证计算模型最终的正确率
#  将训练数据分为训练数据和验证数据 
Example #14
Source File: discrete_model.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def _pandas_to_dummies(endog):
    if endog.ndim == 2:
        if endog.shape[1] == 1:
            yname = endog.columns[0]
            endog_dummies = get_dummies(endog.iloc[:, 0])
        else:  # series
            yname = 'y'
            endog_dummies = endog
    else:
        yname = endog.name
        endog_dummies = get_dummies(endog)
    ynames = endog_dummies.columns.tolist()

    return endog_dummies, ynames, yname


#### Private Model Classes #### 
Example #15
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_int_int(self):
        data = Series([1, 2, 1])
        result = pd.get_dummies(data)
        expected = DataFrame([[1, 0],
                              [0, 1],
                              [1, 0]],
                             columns=[1, 2],
                             dtype=np.uint8)
        tm.assert_frame_equal(result, expected)

        data = Series(pd.Categorical(['a', 'b', 'a']))
        result = pd.get_dummies(data)
        expected = DataFrame([[1, 0],
                              [0, 1],
                              [1, 0]],
                             columns=pd.Categorical(['a', 'b']),
                             dtype=np.uint8)
        tm.assert_frame_equal(result, expected) 
Example #16
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_basic_drop_first_NA(self, sparse):
        # Test NA handling together with drop_first
        s_NA = ['a', 'b', np.nan]
        res = get_dummies(s_NA, drop_first=True, sparse=sparse)
        exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0)

        assert_frame_equal(res, exp)

        res_na = get_dummies(s_NA, dummy_na=True, drop_first=True,
                             sparse=sparse)
        exp_na = DataFrame(
            {'b': [0, 1, 0],
             nan: [0, 0, 1]},
            dtype=np.uint8).reindex(['b', nan], axis=1)
        if sparse:
            exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
                                  sparse=sparse)
        exp_just_na = DataFrame(index=np.arange(1))
        assert_frame_equal(res_just_na, exp_just_na) 
Example #17
Source File: test_reshape.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
        # GH13854
        for ordered in [False, True]:
            cat = pd.Categorical(list("xy"), categories=list("xyz"),
                                 ordered=ordered)
            result = get_dummies(cat, dtype=dtype)

            data = np.array([[1, 0, 0], [0, 1, 0]],
                            dtype=self.effective_dtype(dtype))
            cols = pd.CategoricalIndex(cat.categories,
                                       categories=cat.categories,
                                       ordered=ordered)
            expected = DataFrame(data, columns=cols,
                                 dtype=self.effective_dtype(dtype))

            tm.assert_frame_equal(result, expected) 
Example #18
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_dataframe_dummies_prefix_dict(self, sparse):
        prefixes = {'A': 'from_A', 'B': 'from_B'}
        df = DataFrame({'C': [1, 2, 3],
                        'A': ['a', 'b', 'a'],
                        'B': ['b', 'b', 'c']})
        result = get_dummies(df, prefix=prefixes, sparse=sparse)

        expected = DataFrame({'C': [1, 2, 3],
                              'from_A_a': [1, 0, 1],
                              'from_A_b': [0, 1, 0],
                              'from_B_b': [1, 1, 0],
                              'from_B_c': [0, 0, 1]})

        columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
        expected[columns] = expected[columns].astype(np.uint8)
        assert_frame_equal(result, expected) 
Example #19
Source File: cbc_hb.py    From lifestyles with MIT License 6 votes vote down vote up
def model(profiles, comparisons, selections, sample=2500, alpha_prior_std=10):
    all_attributes = pd.get_dummies(profiles).columns
    profiles_dummies = pd.get_dummies(profiles, drop_first=True)
    choices = pd.concat({profile: profiles_dummies.loc[comparisons[profile]].reset_index(drop=True) for profile in comparisons.columns}, axis=1)

    respondants = selections.columns
    n_attributes_in_model = profiles_dummies.shape[1]
    n_participants = selections.shape[1]

    with pm.Model():

        # https://www.sawtoothsoftware.com/download/ssiweb/CBCHB_Manual.pdf
        # need to include the covariance matrix as a parent of `partsworth`
        alpha = pm.Normal('alpha', 0, sd=alpha_prior_std, shape=n_attributes_in_model, testval=np.random.randn(n_attributes_in_model))
        partsworth = pm.MvNormal("partsworth", alpha, tau=np.eye(n_attributes_in_model), shape=(n_participants, n_attributes_in_model))

        cs = [_create_observation_variable(selection, choices, partsworth[i, :]) for i, (_, selection) in enumerate(selections.iteritems())]

        trace = pm.sample(sample)
    return transform_trace_to_individual_summary_statistics(trace, respondants, profiles_dummies.columns, all_attributes) 
Example #20
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
        # GH13854
        for ordered in [False, True]:
            cat = pd.Categorical(list("xy"), categories=list("xyz"),
                                 ordered=ordered)
            result = get_dummies(cat, dtype=dtype)

            data = np.array([[1, 0, 0], [0, 1, 0]],
                            dtype=self.effective_dtype(dtype))
            cols = pd.CategoricalIndex(cat.categories,
                                       categories=cat.categories,
                                       ordered=ordered)
            expected = DataFrame(data, columns=cols,
                                 dtype=self.effective_dtype(dtype))

            tm.assert_frame_equal(result, expected) 
Example #21
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_int_df(self, dtype):
        data = DataFrame(
            {'A': [1, 2, 1],
             'B': pd.Categorical(['a', 'b', 'a']),
             'C': [1, 2, 1],
             'D': [1., 2., 1.]
             }
        )
        columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
        expected = DataFrame([
            [1, 1., 1, 0, 1, 0],
            [2, 2., 0, 1, 0, 1],
            [1, 1., 1, 0, 1, 0]
        ], columns=columns)
        expected[columns[2:]] = expected[columns[2:]].astype(dtype)
        result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
        tm.assert_frame_equal(result, expected) 
Example #22
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_int_int(self):
        data = Series([1, 2, 1])
        result = pd.get_dummies(data)
        expected = DataFrame([[1, 0],
                              [0, 1],
                              [1, 0]],
                             columns=[1, 2],
                             dtype=np.uint8)
        tm.assert_frame_equal(result, expected)

        data = Series(pd.Categorical(['a', 'b', 'a']))
        result = pd.get_dummies(data)
        expected = DataFrame([[1, 0],
                              [0, 1],
                              [1, 0]],
                             columns=pd.Categorical(['a', 'b']),
                             dtype=np.uint8)
        tm.assert_frame_equal(result, expected) 
Example #23
Source File: app.py    From demo-self-driving with Apache License 2.0 6 votes vote down vote up
def run_the_app():
    # To make Streamlit fast, st.cache allows us to reuse computation across runs.
    # In this common pattern, we download data from an endpoint only once.
    @st.cache
    def load_metadata(url):
        return pd.read_csv(url)

    # This function uses some Pandas magic to summarize the metadata Dataframe.
    @st.cache
    def create_summary(metadata):
        one_hot_encoded = pd.get_dummies(metadata[["frame", "label"]], columns=["label"])
        summary = one_hot_encoded.groupby(["frame"]).sum().rename(columns={
            "label_biker": "biker",
            "label_car": "car",
            "label_pedestrian": "pedestrian",
            "label_trafficLight": "traffic light",
            "label_truck": "truck"
        })
        return summary

    # An amazing property of st.cached functions is that you can pipe them into
    # one another to form a computation DAG (directed acyclic graph). Streamlit
    # recomputes only whatever subset is required to get the right answer!
    metadata = load_metadata(os.path.join(DATA_URL_ROOT, "labels.csv.gz"))
    summary = create_summary(metadata)

    # Uncomment these lines to peek at these DataFrames.
    # st.write('## Metadata', metadata[:1000], '## Summary', summary[:1000])

    # Draw the UI elements to search for objects (pedestrians, cars, etc.)
    selected_frame_index, selected_frame = frame_selector_ui(summary)
    if selected_frame_index == None:
        st.error("No frames fit the criteria. Please select different label or number.")
        return

    # Draw the UI element to select parameters for the YOLO object detector.
    confidence_threshold, overlap_threshold = object_detector_ui()

    # Load the image from S3.
    image_url = os.path.join(DATA_URL_ROOT, selected_frame)
    image = load_image(image_url)

    # Add boxes for objects on the image. These are the boxes for the ground image.
    boxes = metadata[metadata.frame == selected_frame].drop(columns=["frame"])
    draw_image_with_boxes(image, boxes, "Ground Truth",
        "**Human-annotated data** (frame `%i`)" % selected_frame_index)

    # Get the boxes for the objects detected by YOLO by running the YOLO model.
    yolo_boxes = yolo_v3(image, confidence_threshold, overlap_threshold)
    draw_image_with_boxes(image, yolo_boxes, "Real-time Computer Vision",
        "**YOLO v3 Model** (overlap `%3.1f`) (confidence `%3.1f`)" % (overlap_threshold, confidence_threshold))

# This sidebar UI is a little search engine to find certain object types. 
Example #24
Source File: EDA.py    From G-Bert with MIT License 6 votes vote down vote up
def process_side():
    print('process_side')

    side_pd = pd.read_csv(patient_info_file)
    # just use demographic information to avoid future information leak such as lab test and lab measurements
    side_pd = side_pd[['subject_id', 'hadm_id', 'icustay_id',
                       'gender_male', 'admission_type', 'first_icu_stay', 'admission_age',
                       'ethnicity', 'weight', 'height']]

    # process side_information
    side_pd = side_pd.dropna(thresh=4)
    side_pd.fillna(side_pd.mean(), inplace=True)
    side_pd = side_pd.groupby(by=['subject_id', 'hadm_id']).head(
        [1]).reset_index(drop=True)
    side_pd = pd.concat(
        [side_pd, pd.get_dummies(side_pd['ethnicity'])], axis=1)
    side_pd.drop(columns=['ethnicity', 'icustay_id'], inplace=True)
    side_pd.rename(columns={'subject_id': 'SUBJECT_ID',
                            'hadm_id': 'HADM_ID'}, inplace=True)
    return side_pd.reset_index(drop=True) 
Example #25
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_dataframe_dummies_with_na(self, df, sparse, dtype):
        df.loc[3, :] = [np.nan, np.nan, np.nan]
        result = get_dummies(df, dummy_na=True,
                             sparse=sparse, dtype=dtype).sort_index(axis=1)
        expected = DataFrame({'C': [1, 2, 3, np.nan],
                              'A_a': [1, 0, 1, 0],
                              'A_b': [0, 1, 0, 0],
                              'A_nan': [0, 0, 0, 1],
                              'B_b': [1, 1, 0, 0],
                              'B_c': [0, 0, 1, 0],
                              'B_nan': [0, 0, 0, 1]}).sort_index(axis=1)

        e_dtype = self.effective_dtype(dtype)
        columns = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
        expected[columns] = expected[columns].astype(e_dtype)
        assert_frame_equal(result, expected)

        result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
        expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
        assert_frame_equal(result, expected) 
Example #26
Source File: multi_input.py    From aboleth with Apache License 2.0 6 votes vote down vote up
def input_fn(df):
    """Format the downloaded data."""
    # Creates a dictionary mapping from each continuous feature column name (k)
    # to the values of that column stored in a constant Tensor.
    continuous_cols = [df[k].values for k in CONTINUOUS_COLUMNS]
    X_con = np.stack(continuous_cols).astype(np.float32).T

    # Standardise
    X_con -= X_con.mean(axis=0)
    X_con /= X_con.std(axis=0)

    # Creates a dictionary mapping from each categorical feature column name
    categ_cols = [np.where(pd.get_dummies(df[k]).values)[1][:, np.newaxis]
                  for k in CATEGORICAL_COLUMNS]
    n_values = [np.amax(c) + 1 for c in categ_cols]
    X_cat = np.concatenate(categ_cols, axis=1).astype(np.int32)

    # Converts the label column into a constant Tensor.
    label = df[LABEL_COLUMN].values[:, np.newaxis]

    # Returns the feature columns and the label.
    return X_con, X_cat, n_values, label 
Example #27
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_basic_drop_first(self, sparse):
        # GH12402 Add a new parameter `drop_first` to avoid collinearity
        # Basic case
        s_list = list('abc')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame({'b': [0, 1, 0],
                              'c': [0, 0, 1]},
                             dtype=np.uint8)

        result = get_dummies(s_list, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        result = get_dummies(s_series, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        expected.index = list('ABC')
        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected) 
Example #28
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_basic_drop_first_NA(self, sparse):
        # Test NA handling together with drop_first
        s_NA = ['a', 'b', np.nan]
        res = get_dummies(s_NA, drop_first=True, sparse=sparse)
        exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
        assert_frame_equal(res, exp)

        res_na = get_dummies(s_NA, dummy_na=True, drop_first=True,
                             sparse=sparse)
        exp_na = DataFrame(
            {'b': [0, 1, 0],
             nan: [0, 0, 1]},
            dtype=np.uint8).reindex(['b', nan], axis=1)
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
                                  sparse=sparse)
        exp_just_na = DataFrame(index=np.arange(1))
        assert_frame_equal(res_just_na, exp_just_na) 
Example #29
Source File: model.py    From cloudml-samples with Apache License 2.0 6 votes vote down vote up
def generator_input(filenames, chunk_size, batch_size=64):
    """Produce features and labels needed by keras fit_generator."""

    feature_cols = None
    while True:
        input_reader = pd.read_csv(
            tf.gfile.Open(filenames[0]),
            names=CSV_COLUMNS,
            chunksize=chunk_size,
            na_values=' ?')

        for input_data in input_reader:
            input_data = input_data.dropna()
            label = pd.get_dummies(input_data.pop(LABEL_COLUMN))

            input_data = to_numeric_features(input_data, feature_cols)

            # Retains schema for next chunk processing.
            if feature_cols is None:
                feature_cols = input_data.columns

            idx_len = input_data.shape[0]
            for index in range(0, idx_len, batch_size):
                yield (input_data.iloc[index:min(idx_len, index + batch_size)],
                       label.iloc[index:min(idx_len, index + batch_size)]) 
Example #30
Source File: test_reshape.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_basic_drop_first_one_level(self, sparse):
        # Test the case that categorical variable only has one level.
        s_list = list('aaa')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame(index=np.arange(3))

        result = get_dummies(s_list, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        result = get_dummies(s_series, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)

        expected = DataFrame(index=list('ABC'))
        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
        assert_frame_equal(result, expected)