Python pandas.merge() Examples
The following are 30
code examples of pandas.merge().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pandas
, or try the search function
.
Example #1
Source File: movielens_preprocess.py From striatum with BSD 2-Clause "Simplified" License | 9 votes |
def main(): # read and preprocess the movie data movie = pd.read_table('movies.dat', sep='::', names=['movie_id', 'movie_name', 'tag'], engine='python') movie = movie_preprocessing(movie) # read the ratings data and merge it with movie data rating = pd.read_table("ratings.dat", sep="::", names=["user_id", "movie_id", "rating", "timestamp"], engine='python') data = pd.merge(rating, movie, on="movie_id") # extract feature from our data set streaming_batch, user_feature, actions, reward_list = feature_extraction(data) streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False) user_feature.to_csv("user_feature.csv", sep='\t') pd.DataFrame(actions, columns=['movie_id']).to_csv("actions.csv", sep='\t', index=False) reward_list.to_csv("reward_list.csv", sep='\t', index=False) action_context = movie[movie['movie_id'].isin(actions)] action_context.to_csv("action_context.csv", sep='\t', index = False)
Example #2
Source File: test_merge.py From recruit with Apache License 2.0 | 7 votes |
def test_merge_index_singlekey_inner(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) right = DataFrame({'v2': np.random.randn(4)}, index=['d', 'b', 'c', 'a']) # inner join result = merge(left, right, left_on='key', right_index=True, how='inner') expected = left.join(right, on='key').loc[result.index] assert_frame_equal(result, expected) result = merge(right, left, right_on='key', left_index=True, how='inner') expected = left.join(right, on='key').loc[result.index] assert_frame_equal(result, expected.loc[:, result.columns])
Example #3
Source File: test_merge.py From recruit with Apache License 2.0 | 7 votes |
def test_merge_datetime64tz_with_dst_transition(self): # GH 18885 df1 = pd.DataFrame(pd.date_range( '2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'), columns=['date']) df1['value'] = 1 df2 = pd.DataFrame({ 'date': pd.to_datetime([ '2017-10-29 03:00:00', '2017-10-29 04:00:00', '2017-10-29 05:00:00' ]), 'value': 2 }) df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert( 'Europe/Madrid') result = pd.merge(df1, df2, how='outer', on='date') expected = pd.DataFrame({ 'date': pd.date_range( '2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'), 'value_x': [1] * 4 + [np.nan] * 3, 'value_y': [np.nan] * 4 + [2] * 3 }) assert_frame_equal(result, expected)
Example #4
Source File: test_merge.py From recruit with Apache License 2.0 | 6 votes |
def test_merge_on_datetime64tz_empty(self): # https://github.com/pandas-dev/pandas/issues/25014 dtz = pd.DatetimeTZDtype(tz='UTC') right = pd.DataFrame({'date': [pd.Timestamp('2018', tz=dtz.tz)], 'value': [4.0], 'date2': [pd.Timestamp('2019', tz=dtz.tz)]}, columns=['date', 'value', 'date2']) left = right[:0] result = left.merge(right, on='date') expected = pd.DataFrame({ 'value_x': pd.Series(dtype=float), 'date2_x': pd.Series(dtype=dtz), 'date': pd.Series(dtype=dtz), 'value_y': pd.Series(dtype=float), 'date2_y': pd.Series(dtype=dtz), }, columns=['value_x', 'date2_x', 'date', 'value_y', 'date2_y']) tm.assert_frame_equal(result, expected)
Example #5
Source File: test_merge.py From recruit with Apache License 2.0 | 6 votes |
def test_merge_incompat_infer_boolean_object(self): # GH21119: bool + object bool merge OK df1 = DataFrame({'key': Series([True, False], dtype=object)}) df2 = DataFrame({'key': [True, False]}) expected = DataFrame({'key': [True, False]}, dtype=object) result = pd.merge(df1, df2, on='key') assert_frame_equal(result, expected) result = pd.merge(df2, df1, on='key') assert_frame_equal(result, expected) # with missing value df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)}) df2 = DataFrame({'key': [True, False]}) expected = DataFrame({'key': [True, False]}, dtype=object) result = pd.merge(df1, df2, on='key') assert_frame_equal(result, expected) result = pd.merge(df2, df1, on='key') assert_frame_equal(result, expected)
Example #6
Source File: preprocess_for_nn.py From mercari-price-suggestion with MIT License | 6 votes |
def transform(self, df): df["name"] = df["name"].apply(number_preprocess) df["item_description"] = df["item_description"].apply(number_preprocess) for cat in self.cat_cols: df[cat] = df[cat].apply(lambda x: x if x in self.cat_vocab[cat] else "rarecategory") df[cat] = self.le[cat].transform(df[cat]) df["name"] = df["name"].apply(lambda x: cut(x, self.voc)) df["item_description"] = df["item_description"].apply(lambda x: cut(x, self.voc)) df['seq_item_description'] = self.tok_raw.texts_to_sequences(df["item_description"].values) df['seq_name'] = self.tok_raw.texts_to_sequences(df["name"].values) for col in ["name_ori", "item_description_ori"]: f_col = col + "_freq" df = pd.merge(df, self.freqs[col], how="left", on=col) df[f_col] = df[f_col].fillna(0) df[f_col] = df[f_col] / (self.max_freqs[col] + 1) return df
Example #7
Source File: test_merge.py From recruit with Apache License 2.0 | 6 votes |
def test_merge_on_index_with_more_values(self, how): # GH 24212 # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is # interpreted as a missing value instead of the last element df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], columns=['a', 'b']) df2 = pd.DataFrame([[3, 30], [4, 40]], columns=['a', 'c']) df1.set_index('a', drop=False, inplace=True) df2.set_index('a', inplace=True) result = pd.merge(df1, df2, left_index=True, right_on='a', how=how) expected = pd.DataFrame([[1, 2, np.nan], [2, 4, np.nan], [3, 6, 30.0], [4, 8, 40.0]], columns=['a', 'b', 'c']) expected.set_index('a', drop=False, inplace=True) assert_frame_equal(result, expected)
Example #8
Source File: pricing.py From thewarden with MIT License | 6 votes |
def df_fx(self, currency, fx_provider): try: # First get the df from this currency if currency != 'USD': fx = PriceData(currency, fx_provider) fx.df = fx.df.rename(columns={'close': 'fx_close'}) fx.df["fx_close"] = pd.to_numeric(fx.df.fx_close, errors='coerce') # Merge the two dfs: merge_df = pd.merge(self.df, fx.df, on='date', how='inner') merge_df['close'] = merge_df['close'].astype(float) merge_df['close_converted'] = merge_df['close'] * merge_df[ 'fx_close'] return (merge_df) else: # If currency is USD no conversion is needed - prices are all in USD self.df['fx_close'] = 1 self.df['close_converted'] = self.df['close'].astype(float) return (self.df) except Exception as e: self.errors.append(e) return (None)
Example #9
Source File: test_merge.py From recruit with Apache License 2.0 | 6 votes |
def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): # GH 9780, GH 15800 # Raise a ValueError when a user tries to merge on # dtypes that are incompatible (e.g., obj and int/float) df1 = DataFrame({'A': df1_vals}) df2 = DataFrame({'A': df2_vals}) msg = ("You are trying to merge on {lk_dtype} and " "{rk_dtype} columns. If you wish to proceed " "you should use pd.concat".format(lk_dtype=df1['A'].dtype, rk_dtype=df2['A'].dtype)) msg = re.escape(msg) with pytest.raises(ValueError, match=msg): pd.merge(df1, df2, on=['A']) # Check that error still raised when swapping order of dataframes msg = ("You are trying to merge on {lk_dtype} and " "{rk_dtype} columns. If you wish to proceed " "you should use pd.concat".format(lk_dtype=df2['A'].dtype, rk_dtype=df1['A'].dtype)) msg = re.escape(msg) with pytest.raises(ValueError, match=msg): pd.merge(df2, df1, on=['A'])
Example #10
Source File: reshaping.py From recruit with Apache License 2.0 | 6 votes |
def test_merge(self, data, na_value): # GH-20743 df1 = pd.DataFrame({'ext': data[:3], 'int1': [1, 2, 3], 'key': [0, 1, 2]}) df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]}) res = pd.merge(df1, df2) exp = pd.DataFrame( {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], 'ext': data._from_sequence([data[0], data[0], data[1]], dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) res = pd.merge(df1, df2, how='outer') exp = pd.DataFrame( {'int1': [1, 1, 2, 3, np.nan], 'int2': [1, 2, 3, np.nan, 4], 'key': [0, 0, 1, 2, 3], 'ext': data._from_sequence( [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
Example #11
Source File: reshaping.py From recruit with Apache License 2.0 | 6 votes |
def test_merge_on_extension_array(self, data): # GH 23020 a, b = data[:2] key = type(data)._from_sequence([a, b], dtype=data.dtype) df = pd.DataFrame({"key": key, "val": [1, 2]}) result = pd.merge(df, df, on='key') expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]}) self.assert_frame_equal(result, expected) # order result = pd.merge(df.iloc[[1, 0]], df, on='key') expected = expected.iloc[[1, 0]].reset_index(drop=True) self.assert_frame_equal(result, expected)
Example #12
Source File: test_merge.py From recruit with Apache License 2.0 | 6 votes |
def test_other_timedelta_unit(self, unit): # GH 13389 df1 = pd.DataFrame({'entity_id': [101, 102]}) s = pd.Series([None, None], index=[101, 102], name='days') dtype = "m8[{}]".format(unit) df2 = s.astype(dtype).to_frame('days') assert df2['days'].dtype == 'm8[ns]' result = df1.merge(df2, left_on='entity_id', right_index=True) exp = pd.DataFrame({'entity_id': [101, 102], 'days': np.array(['nat', 'nat'], dtype=dtype)}, columns=['entity_id', 'days']) tm.assert_frame_equal(result, exp)
Example #13
Source File: test_multi.py From recruit with Apache License 2.0 | 6 votes |
def test_merge_on_multikey(self, left, right, join_type): on_cols = ['key1', 'key2'] result = (left.join(right, on=on_cols, how=join_type) .reset_index(drop=True)) expected = pd.merge(left, right.reset_index(), on=on_cols, how=join_type) tm.assert_frame_equal(result, expected) result = (left.join(right, on=on_cols, how=join_type, sort=True) .reset_index(drop=True)) expected = pd.merge(left, right.reset_index(), on=on_cols, how=join_type, sort=True) tm.assert_frame_equal(result, expected)
Example #14
Source File: test_merge.py From recruit with Apache License 2.0 | 6 votes |
def test_other_datetime_unit(self): # GH 13389 df1 = pd.DataFrame({'entity_id': [101, 102]}) s = pd.Series([None, None], index=[101, 102], name='days') for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]', 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']: df2 = s.astype(dtype).to_frame('days') # coerces to datetime64[ns], thus sholuld not be affected assert df2['days'].dtype == 'datetime64[ns]' result = df1.merge(df2, left_on='entity_id', right_index=True) exp = pd.DataFrame({'entity_id': [101, 102], 'days': np.array(['nat', 'nat'], dtype='datetime64[ns]')}, columns=['entity_id', 'days']) tm.assert_frame_equal(result, exp)
Example #15
Source File: test_multi.py From recruit with Apache License 2.0 | 6 votes |
def test_single_common_level(self): index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), ('K1', 'X2')], names=['key', 'X']) left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, index=index_left) index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), ('K2', 'Y2'), ('K2', 'Y3')], names=['key', 'Y']) right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}, index=index_right) result = left.join(right) expected = (pd.merge(left.reset_index(), right.reset_index(), on=['key'], how='inner') .set_index(['key', 'X', 'Y'])) tm.assert_frame_equal(result, expected)
Example #16
Source File: test_merge.py From recruit with Apache License 2.0 | 6 votes |
def test_merge_join_key_dtype_cast(self): # #8596 df1 = DataFrame({'key': [1], 'v1': [10]}) df2 = DataFrame({'key': [2], 'v1': [20]}) df = merge(df1, df2, how='outer') assert df['key'].dtype == 'int64' df1 = DataFrame({'key': [True], 'v1': [1]}) df2 = DataFrame({'key': [False], 'v1': [0]}) df = merge(df1, df2, how='outer') # GH13169 # this really should be bool assert df['key'].dtype == 'object' df1 = DataFrame({'val': [1]}) df2 = DataFrame({'val': [2]}) lkey = np.array([1]) rkey = np.array([2]) df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer') assert df['key_0'].dtype == 'int64'
Example #17
Source File: test_merge.py From recruit with Apache License 2.0 | 6 votes |
def test_merge_misspecified(self): msg = "Must pass right_on or right_index=True" with pytest.raises(pd.errors.MergeError, match=msg): merge(self.left, self.right, left_index=True) msg = "Must pass left_on or left_index=True" with pytest.raises(pd.errors.MergeError, match=msg): merge(self.left, self.right, right_index=True) msg = ('Can only pass argument "on" OR "left_on" and "right_on", not' ' a combination of both') with pytest.raises(pd.errors.MergeError, match=msg): merge(self.left, self.left, left_on='key', on='key') msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): merge(self.df, self.df2, left_on=['key1'], right_on=['key1', 'key2'])
Example #18
Source File: test_merge.py From recruit with Apache License 2.0 | 6 votes |
def test_merge_index_singlekey_right_vs_left(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) right = DataFrame({'v2': np.random.randn(4)}, index=['d', 'b', 'c', 'a']) merged1 = merge(left, right, left_on='key', right_index=True, how='left', sort=False) merged2 = merge(right, left, right_on='key', left_index=True, how='right', sort=False) assert_frame_equal(merged1, merged2.loc[:, merged1.columns]) merged1 = merge(left, right, left_on='key', right_index=True, how='left', sort=True) merged2 = merge(right, left, right_on='key', left_index=True, how='right', sort=True) assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
Example #19
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_left_merge_empty_dataframe(self): left = DataFrame({'key': [1], 'value': [2]}) right = DataFrame({'key': []}) result = merge(left, right, on='key', how='left') assert_frame_equal(result, left) result = merge(right, left, on='key', how='right') assert_frame_equal(result, left)
Example #20
Source File: test_join.py From recruit with Apache License 2.0 | 5 votes |
def test_join_inner_multiindex(self): key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux', 'snap'] key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three', 'one'] data = np.random.randn(len(key1)) data = DataFrame({'key1': key1, 'key2': key2, 'data': data}) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=['j_one', 'j_two', 'j_three']) joined = data.join(to_join, on=['key1', 'key2'], how='inner') expected = merge(data, to_join.reset_index(), left_on=['key1', 'key2'], right_on=['first', 'second'], how='inner', sort=False) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) assert_frame_equal(joined, expected2.reindex_like(joined)) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) expected = expected.drop(['first', 'second'], axis=1) expected.index = joined.index assert joined.index.is_monotonic assert_frame_equal(joined, expected) # _assert_same_contents(expected, expected2.loc[:, expected.columns])
Example #21
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_index_and_on_parameters_confusion(self): msg = ("right_index parameter must be of type bool, not" r" <(class|type) 'list'>") with pytest.raises(ValueError, match=msg): merge(self.df, self.df2, how='left', left_index=False, right_index=['key1', 'key2']) msg = ("left_index parameter must be of type bool, not " r"<(class|type) 'list'>") with pytest.raises(ValueError, match=msg): merge(self.df, self.df2, how='left', left_index=['key1', 'key2'], right_index=False) with pytest.raises(ValueError, match=msg): merge(self.df, self.df2, how='left', left_index=['key1', 'key2'], right_index=['key1', 'key2'])
Example #22
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_merge_inner_join_empty(self): # GH 15328 df_empty = pd.DataFrame() df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64') result = pd.merge(df_empty, df_a, left_index=True, right_index=True) expected = pd.DataFrame({'a': []}, index=[], dtype='int64') assert_frame_equal(result, expected)
Example #23
Source File: test_join.py From recruit with Apache License 2.0 | 5 votes |
def test_join_multi_to_multi(self, join_type): # GH 20475 leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]], names=['abc', 'xy', 'num']) left = DataFrame({'v1': range(12)}, index=leftindex) rightindex = MultiIndex.from_product([list('abc'), list('xy')], names=['abc', 'xy']) right = DataFrame({'v2': [100 * i for i in range(1, 7)]}, index=rightindex) result = left.join(right, on=['abc', 'xy'], how=join_type) expected = (left.reset_index() .merge(right.reset_index(), on=['abc', 'xy'], how=join_type) .set_index(['abc', 'xy', 'num']) ) assert_frame_equal(expected, result) msg = (r'len\(left_on\) must equal the number of levels in the index' ' of "right"') with pytest.raises(ValueError, match=msg): left.join(right, on='xy', how=join_type) with pytest.raises(ValueError, match=msg): right.join(left, on=['abc', 'xy'], how=join_type)
Example #24
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_merge_common(self): joined = merge(self.df, self.df2) exp = merge(self.df, self.df2, on=['key1', 'key2']) tm.assert_frame_equal(joined, exp)
Example #25
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_merge_overlap(self): merged = merge(self.left, self.left, on='key') exp_len = (self.left['key'].value_counts() ** 2).sum() assert len(merged) == exp_len assert 'v1_x' in merged assert 'v1_y' in merged
Example #26
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_merge_copy(self): left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) merged = merge(left, right, left_index=True, right_index=True, copy=True) merged['a'] = 6 assert (left['a'] == 0).all() merged['d'] = 'peekaboo' assert (right['d'] == 'bar').all()
Example #27
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_merge_nocopy(self): left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) merged = merge(left, right, left_index=True, right_index=True, copy=False) merged['a'] = 6 assert (left['a'] == 6).all() merged['d'] = 'peekaboo' assert (right['d'] == 'peekaboo').all()
Example #28
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame left = DataFrame({'key': [1, 1, 2, 2, 3], 'value': lrange(5)}, columns=['value', 'key']) right = DataFrame({'key': [1, 1, 2, 3, 4, 5], 'rvalue': lrange(6)}) joined = merge(left, right, on='key', how='outer') expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5], 'value': np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]), 'rvalue': [0, 1, 0, 1, 2, 2, 3, 4, 5]}, columns=['value', 'key', 'rvalue']) assert_frame_equal(joined, expected)
Example #29
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals): # GH 16572 # Check that float column is not cast to object if # merging on float and int columns A = DataFrame({'X': int_vals}) B = DataFrame({'Y': float_vals}) expected = DataFrame(exp_vals) result = A.merge(B, left_on='X', right_on='Y') assert_frame_equal(result, expected) result = B.merge(A, left_on='Y', right_on='X') assert_frame_equal(result, expected[['Y', 'X']])
Example #30
Source File: test_merge.py From recruit with Apache License 2.0 | 5 votes |
def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): # these are explicity allowed incompat merges, that pass thru # the result type is dependent on if the values on the rhs are # inferred, otherwise these will be coereced to object df1 = DataFrame({'A': df1_vals}) df2 = DataFrame({'A': df2_vals}) result = pd.merge(df1, df2, on=['A']) assert is_object_dtype(result.A.dtype) result = pd.merge(df2, df1, on=['A']) assert is_object_dtype(result.A.dtype)