首页 > 编程语言 > 详细

python学习-数据规整

时间:2020-06-05 00:52:05      阅读:60      评论:0      收藏:0      [点我收藏+]

1、分级索引

data = pd.Series(np.random.randn(9),
                 index=[[a, a, a, b, b, c, c, d, d],
                        [1,2,3,1,3,1,2,2,3]])
data.index
data[b]
data[b:d]
data.loc[:, 2]
data.loc[:, 1]
data.unstack()
data.unstack().stack()

frame = pd.DataFrame(np.arange(12).reshape((4, 3)), index=[[a, a, b, b], [1, 2, 1, 2]],
                     columns=[[Ohio, Ohio, Colorado],
                              [Green, Red, Green]])
frame.index.names = [key1, key2]
frame.columns.names = [state, color]
frame[Ohio]

frame.swaplevel(key1,key2) #交换level
frame.sort_index(level=1)  #交换level
frame.swaplevel(0, 1).sort_index(level=0)

#Summary Statistics by Level
frame.sum(level=key1)
frame.sum(level=color, axis=1)
#Indexing with a DataFrame’s columns列作为索引
frame = pd.DataFrame({a: range(7), b: range(7, 0, -1), c: [one, one, one, two, two,
two, two], d:[0,1,2,0,1,2,3]})
frame2 = frame.set_index([c, d])
frame.set_index([c, d], drop=False)
frame2.reset_index()

2、合并数据集

df1= pd.DataFrame({key: [b, b, a, c, a, a, b], data1: range(7)})
df2= pd.DataFrame({key: [a, b, d], data2: range(3)})
pd.merge(df1,df2)
pd.merge(df1, df2, on=key) #显示指定

df3 = pd.DataFrame({lkey: [b, b, a, c, a, a, b], data1: range(7)})
df4 = pd.DataFrame({rkey: [a, b, d], data2: range(3)})
pd.merge(df3, df4, left_on=lkey, right_on=rkey) #不同列名key内连接

pd.merge(df1, df2, how=outer) #外连接

pd.merge(df1, df2, how=outer) #inner left right output

left = pd.DataFrame({key1: [foo, foo, bar], key2: [one, two, one], lval: [1, 2, 3]})
right = pd.DataFrame({key1: [foo, foo, bar, bar], key2: [one, one, one, two],
rval: [4, 5, 6, 7]}) 
pd.merge(left, right, on=[key1, key2], how=outer) #key1 key2作为一个元组进行外连接

#以下是等价的 默认后缀和指定后缀
pd.merge(left, right, on=key1) 
pd.merge(left, right, on=key1, suffixes=(_left, _right))

#基于index索引的合并
left1 = pd.DataFrame({key: [a, b, a, a, b, c], value: range(6)})
right1 = pd.DataFrame({group_val: [3.5, 7]}, index=[a, b])
pd.merge(left1, right1, left_on=key, right_index=True)
pd.merge(left1, right1, left_on=key, right_index=True, how=outer)

lefth= pd.DataFrame({key1: [Ohio, Ohio, Ohio, Nevada, Nevada],key2:[2000,2001, 2002, 2001, 2002], data: np.arange(5.)})
righth=pd.DataFrame(np.arange(12).reshape((6, 2)), index=[[Nevada, Nevada, Ohio, Ohio,
                       Ohio, Ohio],
                      [2001, 2000, 2000, 2000, 2001, 2002]],
               columns=[event1, event2])
pd.merge(lefth, righth, left_on=[key1, key2], right_index=True)
pd.merge(lefth, righth, left_on=[key1, key2], right_index=True,how=outer)

left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=[a, c, e],
              columns=[Ohio, Nevada])
right2= pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]], index=[b, c, d, e],
               columns=[Missouri, Alabama])
pd.merge(left2, right2, how=outer, left_index=True, right_index=True)
left2.join(right2, how=outer)
left1.join(right1, on=key)

another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]], index=[a, c, e, f],
                       columns=[New York, Oregon])
left2.join([right2, another])
left2.join([right2, another], how=outer)

另一种合并数据集

arr = np.arange(12).reshape((3, 4))
np.concatenate([arr, arr], axis=1)

s1 = pd.Series([0, 1], index=[a, b])
s2 = pd.Series([2, 3, 4], index=[c, d, e])
s3 = pd.Series([5, 6], index=[f, g])


pd.concat([s1,s2,s3])

pd.concat([s1, s2, s3], axis=1)

s4 = pd.concat([s1, s3]) # s4 = pd.concat([s1, s3],axis=0)

pd.concat([s1, s4], axis=1)  #外连接
pd.concat([s1, s4], axis=1,join=inner)

result = pd.concat([s1, s1, s3], keys=[one, two, three])
result.unstack()
pd.concat([s1, s2, s3], axis=1, keys=[one, two, three]) #通过指定key区分结果集

#同样的方式应用有frame
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=[a, b, c], columns=[one, two])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=[a, c], columns=[three, four])
pd.concat([df1, df2], axis=1, keys=[level1, level2])
pd.concat({level1: df1, level2: df2}, axis=1)
pd.concat([df1, df2], axis=1, keys=[level1, level2],names=[upper, lower])

df1 = pd.DataFrame(np.random.randn(3, 4), columns=[a, b, c, d])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=[b, d, a])
pd.concat([df1, df2], ignore_index=True)

覆盖的方式合并Combining Data with Overlap

a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=[f, e, d, c, b, a])
b = pd.Series(np.arange(len(a), dtype=np.float64), index=[f, e, d, c, b, a])
b[-1] = np.nan
np.where(pd.isnull(a), b, a)
b[:-2].combine_first(a[2:])

df1 = pd.DataFrame({a: [1., np.nan, 5., np.nan], b: [np.nan, 2., np.nan, 6.],
c: range(2, 18, 4)})
df2 = pd.DataFrame({a: [5., 4., np.nan, 3., 7.],
b: [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2) #df1为空时 用df2对应值替换

Reshaping and Pivoting

data = pd.DataFrame(np.arange(6).reshape((2, 3)), index=pd.Index([Ohio, Colorado], name=state),
                    columns=pd.Index([one, two, three], name=number))
result = data.stack()
result.unstack()
result.unstack(state)

s1 = pd.Series([0, 1, 2, 3], index=[a, b, c, d])
s2 = pd.Series([4, 5, 6], index=[c, d, e])
data2 = pd.concat([s1, s2], keys=[one, two])
data2.unstack()
data2.unstack().stack()
data2.unstack().stack(dropna=False)

df = pd.DataFrame({left: result, right: result + 5}, columns=pd.Index([left, right], name=side))
df.unstack(state)
df.unstack(state).stack(side)

 

 

python学习-数据规整

原文:https://www.cnblogs.com/excellence/p/13047145.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!