1、分级索引
data = pd.Series(np.random.randn(9), index=[[‘a‘, ‘a‘, ‘a‘, ‘b‘, ‘b‘, ‘c‘, ‘c‘, ‘d‘, ‘d‘], [1,2,3,1,3,1,2,2,3]]) data.index data[‘b‘] data[‘b‘:‘d‘] data.loc[:, 2] data.loc[:, 1] data.unstack() data.unstack().stack() frame = pd.DataFrame(np.arange(12).reshape((4, 3)), index=[[‘a‘, ‘a‘, ‘b‘, ‘b‘], [1, 2, 1, 2]], columns=[[‘Ohio‘, ‘Ohio‘, ‘Colorado‘], [‘Green‘, ‘Red‘, ‘Green‘]]) frame.index.names = [‘key1‘, ‘key2‘] frame.columns.names = [‘state‘, ‘color‘] frame[‘Ohio‘] frame.swaplevel(‘key1‘,‘key2‘) #交换level frame.sort_index(level=1) #交换level frame.swaplevel(0, 1).sort_index(level=0) #Summary Statistics by Level frame.sum(level=‘key1‘) frame.sum(level=‘color‘, axis=1) #Indexing with a DataFrame’s columns列作为索引 frame = pd.DataFrame({‘a‘: range(7), ‘b‘: range(7, 0, -1), ‘c‘: [‘one‘, ‘one‘, ‘one‘, ‘two‘, ‘two‘, ‘two‘, ‘two‘], ‘d‘:[0,1,2,0,1,2,3]}) frame2 = frame.set_index([‘c‘, ‘d‘]) frame.set_index([‘c‘, ‘d‘], drop=False) frame2.reset_index()
2、合并数据集
df1= pd.DataFrame({‘key‘: [‘b‘, ‘b‘, ‘a‘, ‘c‘, ‘a‘, ‘a‘, ‘b‘], ‘data1‘: range(7)}) df2= pd.DataFrame({‘key‘: [‘a‘, ‘b‘, ‘d‘], ‘data2‘: range(3)}) pd.merge(df1,df2) pd.merge(df1, df2, on=‘key‘) #显示指定 df3 = pd.DataFrame({‘lkey‘: [‘b‘, ‘b‘, ‘a‘, ‘c‘, ‘a‘, ‘a‘, ‘b‘], ‘data1‘: range(7)}) df4 = pd.DataFrame({‘rkey‘: [‘a‘, ‘b‘, ‘d‘], ‘data2‘: range(3)}) pd.merge(df3, df4, left_on=‘lkey‘, right_on=‘rkey‘) #不同列名key内连接 pd.merge(df1, df2, how=‘outer‘) #外连接 pd.merge(df1, df2, how=‘outer‘) #inner left right output left = pd.DataFrame({‘key1‘: [‘foo‘, ‘foo‘, ‘bar‘], ‘key2‘: [‘one‘, ‘two‘, ‘one‘], ‘lval‘: [1, 2, 3]}) right = pd.DataFrame({‘key1‘: [‘foo‘, ‘foo‘, ‘bar‘, ‘bar‘], ‘key2‘: [‘one‘, ‘one‘, ‘one‘, ‘two‘], ‘rval‘: [4, 5, 6, 7]}) pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘outer‘) #key1 key2作为一个元组进行外连接 #以下是等价的 默认后缀和指定后缀 pd.merge(left, right, on=‘key1‘) pd.merge(left, right, on=‘key1‘, suffixes=(‘_left‘, ‘_right‘)) #基于index索引的合并 left1 = pd.DataFrame({‘key‘: [‘a‘, ‘b‘, ‘a‘, ‘a‘, ‘b‘, ‘c‘], ‘value‘: range(6)}) right1 = pd.DataFrame({‘group_val‘: [3.5, 7]}, index=[‘a‘, ‘b‘]) pd.merge(left1, right1, left_on=‘key‘, right_index=True) pd.merge(left1, right1, left_on=‘key‘, right_index=True, how=‘outer‘) lefth= pd.DataFrame({‘key1‘: [‘Ohio‘, ‘Ohio‘, ‘Ohio‘, ‘Nevada‘, ‘Nevada‘],‘key2‘:[2000,2001, 2002, 2001, 2002], ‘data‘: np.arange(5.)}) righth=pd.DataFrame(np.arange(12).reshape((6, 2)), index=[[‘Nevada‘, ‘Nevada‘, ‘Ohio‘, ‘Ohio‘, ‘Ohio‘, ‘Ohio‘], [2001, 2000, 2000, 2000, 2001, 2002]], columns=[‘event1‘, ‘event2‘]) pd.merge(lefth, righth, left_on=[‘key1‘, ‘key2‘], right_index=True) pd.merge(lefth, righth, left_on=[‘key1‘, ‘key2‘], right_index=True,how=‘outer‘) left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=[‘a‘, ‘c‘, ‘e‘], columns=[‘Ohio‘, ‘Nevada‘]) right2= pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]], index=[‘b‘, ‘c‘, ‘d‘, ‘e‘], columns=[‘Missouri‘, ‘Alabama‘]) pd.merge(left2, right2, how=‘outer‘, left_index=True, right_index=True) left2.join(right2, how=‘outer‘) left1.join(right1, on=‘key‘) another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]], index=[‘a‘, ‘c‘, ‘e‘, ‘f‘], columns=[‘New York‘, ‘Oregon‘]) left2.join([right2, another]) left2.join([right2, another], how=‘outer‘)
另一种合并数据集
arr = np.arange(12).reshape((3, 4)) np.concatenate([arr, arr], axis=1) s1 = pd.Series([0, 1], index=[‘a‘, ‘b‘]) s2 = pd.Series([2, 3, 4], index=[‘c‘, ‘d‘, ‘e‘]) s3 = pd.Series([5, 6], index=[‘f‘, ‘g‘]) pd.concat([s1,s2,s3]) pd.concat([s1, s2, s3], axis=1) s4 = pd.concat([s1, s3]) # s4 = pd.concat([s1, s3],axis=0) pd.concat([s1, s4], axis=1) #外连接 pd.concat([s1, s4], axis=1,join=‘inner‘) result = pd.concat([s1, s1, s3], keys=[‘one‘, ‘two‘, ‘three‘]) result.unstack() pd.concat([s1, s2, s3], axis=1, keys=[‘one‘, ‘two‘, ‘three‘]) #通过指定key区分结果集 #同样的方式应用有frame df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=[‘a‘, ‘b‘, ‘c‘], columns=[‘one‘, ‘two‘]) df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=[‘a‘, ‘c‘], columns=[‘three‘, ‘four‘]) pd.concat([df1, df2], axis=1, keys=[‘level1‘, ‘level2‘]) pd.concat({‘level1‘: df1, ‘level2‘: df2}, axis=1) pd.concat([df1, df2], axis=1, keys=[‘level1‘, ‘level2‘],names=[‘upper‘, ‘lower‘]) df1 = pd.DataFrame(np.random.randn(3, 4), columns=[‘a‘, ‘b‘, ‘c‘, ‘d‘]) df2 = pd.DataFrame(np.random.randn(2, 3), columns=[‘b‘, ‘d‘, ‘a‘]) pd.concat([df1, df2], ignore_index=True)
覆盖的方式合并Combining Data with Overlap
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=[‘f‘, ‘e‘, ‘d‘, ‘c‘, ‘b‘, ‘a‘]) b = pd.Series(np.arange(len(a), dtype=np.float64), index=[‘f‘, ‘e‘, ‘d‘, ‘c‘, ‘b‘, ‘a‘]) b[-1] = np.nan np.where(pd.isnull(a), b, a) b[:-2].combine_first(a[2:]) df1 = pd.DataFrame({‘a‘: [1., np.nan, 5., np.nan], ‘b‘: [np.nan, 2., np.nan, 6.], ‘c‘: range(2, 18, 4)}) df2 = pd.DataFrame({‘a‘: [5., 4., np.nan, 3., 7.], ‘b‘: [np.nan, 3., 4., 6., 8.]}) df1.combine_first(df2) #df1为空时 用df2对应值替换
Reshaping and Pivoting
data = pd.DataFrame(np.arange(6).reshape((2, 3)), index=pd.Index([‘Ohio‘, ‘Colorado‘], name=‘state‘), columns=pd.Index([‘one‘, ‘two‘, ‘three‘], name=‘number‘)) result = data.stack() result.unstack() result.unstack(‘state‘) s1 = pd.Series([0, 1, 2, 3], index=[‘a‘, ‘b‘, ‘c‘, ‘d‘]) s2 = pd.Series([4, 5, 6], index=[‘c‘, ‘d‘, ‘e‘]) data2 = pd.concat([s1, s2], keys=[‘one‘, ‘two‘]) data2.unstack() data2.unstack().stack() data2.unstack().stack(dropna=False) df = pd.DataFrame({‘left‘: result, ‘right‘: result + 5}, columns=pd.Index([‘left‘, ‘right‘], name=‘side‘)) df.unstack(‘state‘) df.unstack(‘state‘).stack(‘side‘)
原文:https://www.cnblogs.com/excellence/p/13047145.html