类似于一维数组的对象: 由一组数据(各种numpy对象)和一组与之相关的索引组成。分别有两个属性,.values 和 .index
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
from numpy import random
#直接定义Series
obj=Series([-4,7,8,9],index=['a','b','c','d'])
#用字典创建Series
dict={'a':-4,'b':7,'c':8,'d':9}
obj1=Series(dict)
obj['a']
obj[['a','b']]
a -4
b 7
dtype: int64
obj[obj>0]
b 7
c 8
d 9
dtype: int64
#对已有index的Series传入新index,对不上的索引处为NaN
obj1=Series(obj,index=['c','d','e','f'])
#对已有key的dict传入新index,对不上的索引处为NaN
obj2=Series(dict,index=['c','d','e','f'])
是表格型数据结构,含有一组有序的列,每列可以是不同的值类型。
可看作是由一组Series组成(共用同一个索引)
#直接定义
data = [[1,2,3],[4,5,6]]
index = [0,1]
columns=['a','b','c']
df = pd.DataFrame(data=data, index=index, columns=columns)
#利用dict转成DataFrame
dict={'a':[1,2,3],'b':[4,5,6],'c':[7,8,9]}
df1=DataFrame(dict,index=['one','two','three'])
df1['a']
df1.a
one 1
two 2
three 3
Name: a, dtype: int64
df1['a']=1
df1['d']=4
del df1['d']
.reindex()
df2=df1.reindex(index=['two','one','four','three'],columns=['b','c','a','e'],fill_value=22)
print (df2)
b c a e
two 5 8 1 22
one 4 7 1 22
four 22 22 22 22
three 6 9 1 22
df3=df2.drop('four')
df3=df2.drop('e',axis=1)
s1=df3['b']
s1['two']
s1[['two','one']] # 用数组列出,要用[ ]括起来
s1['two':'four'] # 标签切片的右区间是闭合的
s1[1:3] # 标号切片的右区间是开放的
one 4
four 22
Name: b, dtype: int64
1. 列索引:直接用列的字段名索引 (注意:行索引不能直接用字段名)
df3['b']
df3[['b','c']]
b | c | |
---|---|---|
two | 5 | 8 |
one | 4 | 7 |
four | 22 | 22 |
three | 6 | 9 |
2. .loc 通过标签索引数据
df3.loc[['two','one']] #索引多行,行名用数组
df3.loc['two':'three'] #索引多行,行名用切片
df3.loc['two',['b','a']] #索引某行多列,列名用数组
df3.loc['two','b':'a'] #索引某行多列,列名用切片
df3.loc[:,['b','a']] #索引某列
b | a | |
---|---|---|
two | 5 | 1 |
one | 4 | 1 |
four | 22 | 22 |
three | 6 | 1 |
3. .iloc 通过标号获取数据
df3.iloc[1:3,1:3]
c | a | |
---|---|---|
one | 7 | 1 |
four | 22 | 22 |
.ix 结合前两者的混合索引,可同时使用标签和行号,注意:目前.ix已弃用
df3[df3['a']<2] #先用某一列过滤,返回一个布尔数组,然后通过布尔数组再过滤行
b | c | a | |
---|---|---|---|
two | 5 | 8 | 1 |
one | 4 | 7 | 1 |
three | 6 | 9 | 1 |
df3[df3<2]=0 #选取特定的值重新赋值
s1 = df3.loc['two']
s2 = df3['b']
# df4=df3+s1
df4=df3.add(s1) #默认s1的索引匹配df3的列标签,按行广播
print (df3)
print (s1)
print (df4)
b c a
two 5 8 0
one 4 7 0
four 22 22 22
three 6 9 0
b 5
c 8
a 0
Name: two, dtype: int64
b c a
two 10 16 0
one 9 15 0
four 27 30 22
three 11 17 0
df5=df3.add(s2,axis=0) #指定s2的索引是匹配行标签,则是按列广播
print (df3)
print (s2)
print (df5)
b c a
two 5 8 0
one 4 7 0
four 22 22 22
three 6 9 0
two 5
one 4
four 22
three 6
Name: b, dtype: int64
b c a
two 10 13 5
one 8 11 4
four 44 44 44
three 12 15 6
1.根据任意轴索引排序
df3.sort_index() #默认axis=0,按行标签排序
df3.sort_index(axis=1,ascending=False) #按列标签排序
c | b | a | |
---|---|---|---|
two | 8 | 5 | 0 |
one | 7 | 4 | 0 |
four | 22 | 22 | 22 |
three | 9 | 6 | 0 |
2.根据某个列值排序
df3.sort_values(by='b')
b | c | a | |
---|---|---|---|
one | 4 | 7 | 0 |
two | 5 | 8 | 0 |
three | 6 | 9 | 0 |
four | 22 | 22 | 22 |
3.排名
df3.rank(axis=0,method='first') # axis=0 为按列排名, axis=1 为按行排名
b | c | a | |
---|---|---|---|
two | 2.0 | 2.0 | 1.0 |
one | 1.0 | 1.0 | 2.0 |
four | 4.0 | 4.0 | 4.0 |
three | 3.0 | 3.0 | 3.0 |
Series中的方法
1.唯一值
obj=Series(['a','a','c','d','e','a','b'])
obj.unique() # 返回的是一个ndarray
array(['a', 'c', 'd', 'e', 'b'], dtype=object)
2.值计数
obj1=obj.value_counts() #常用于统计某个离散值各个类别的频率
print(obj1)
a 3
d 1
c 1
b 1
e 1
dtype: int64
3.成员资格
obj2=obj.isin(['b','c'])
print(obj2)
0 False
1 False
2 True
3 False
4 False
5 False
6 True
dtype: bool
1.滤除缺失数据
Series的方法
import numpy as np
from numpy import nan as NA
data=Series([1,NA,3.5,NA,10])
data.dropna() #用.dropna()的方法
data[data.notnull()] #通过布尔索引来过滤
0 1.0
2 3.5
4 10.0
dtype: float64
DataFrame的方法
df3['d']=NA
df4=df3.dropna(axis=1) #axis=0 为丢掉所有含NaN的行,axis=1 为丢掉所有含NaN的列
df4=df3.dropna(how='all') # how='all' 为只丢掉整行都是NaN的行
df4=df3.dropna(thresh=2) # thresh=2 为保留至少有2个非NaN值的行
2.填充缺失数据
df3['d']=[6,7,8,9]
df3.iloc[0:2,2:4]=NA
df3.fillna(1) #可对所有缺失值赋值
df3.fillna({'a':4,'d':5}) #可用字典对不同的列填充不同的值
b | c | a | d | |
---|---|---|---|---|
two | 5 | 8 | 4.0 | 5.0 |
one | 4 | 7 | 4.0 | 5.0 |
four | 22 | 22 | 22.0 | 8.0 |
three | 6 | 9 | 0.0 | 9.0 |
1.创建层次化索引
data=Series(np.random.randn(6),index=[['a','a','b','b','c','c'],[1,2,1,2,1,2]])
print(data)
a 1 -2.403726
2 0.534817
b 1 1.007619
2 -0.555399
c 1 -0.715177
2 -0.850348
dtype: float64
2.Series层次化索引的切片操作
data['a':'b']
data.loc[['a','c']]
data.loc[:,2] #类似DataFrame中先选行再选列,在层次化索引中先选外层再选内层
a 0.534817
b -0.555399
c -0.850348
dtype: float64
3.层次化索引与DataFrame间的转换
data.unstack() #从层次化索引转换成DataFrame
data.unstack().stack() #从DataFrame转换成层次化索引
a 1 -2.403726
2 0.534817
b 1 1.007619
2 -0.555399
c 1 -0.715177
2 -0.850348
dtype: float64
1.创建层次化索引
frame=DataFrame(np.arange(12).reshape(4,3),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame.index.names=['key1','key2']
frame.columns.names=['state','color']
frame
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
2.重排分级顺序
frame.swaplevel('key1','key2')
# frame.swaplevel('key1','key2').sort_index(level='key2')
frame.swaplevel(0,1).sort_index(level=0)
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key2 | key1 | |||
1 | a | 0 | 1 | 2 |
b | 6 | 7 | 8 | |
2 | a | 3 | 4 | 5 |
b | 9 | 10 | 11 |
3.将某列作为索引
frame=DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
frame.set_index(['c','d']) #把某列作为行索引,该列从DataFrame中移除
frame.set_index(['c','d'],drop=False) #把某列作为行索引,该列在DataFrame中保留
frame.reset_index( ) #将层次化索引的级别转到列里面
index | a | b | c | d | |
---|---|---|---|---|---|
0 | 0 | 0 | 7 | one | 0 |
1 | 1 | 1 | 6 | one | 1 |
2 | 2 | 2 | 5 | one | 2 |
3 | 3 | 3 | 4 | two | 0 |
4 | 4 | 4 | 3 | two | 1 |
5 | 5 | 5 | 2 | two | 2 |
6 | 6 | 6 | 1 | two | 3 |
4.DataFrame层次化索引的切片操作
frame=DataFrame(np.arange(12).reshape(4,3),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame.index.names=['key1','key2']
frame.columns.names=['state','color']
frame
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
(1)列索引可以直接用列名
frame['Ohio'] #取最外层的列标签
frame['Ohio','Green'] #取内层的单个列标签
frame['Ohio'][['Red','Green']] #取内层的多个列标签,先取外层得出一个DataFrame,再取内层
color | Red | Green | |
---|---|---|---|
key1 | key2 | ||
a | 1 | 1 | 0 |
2 | 4 | 3 | |
b | 1 | 7 | 6 |
2 | 10 | 9 |
(2)行索引 要用 .loc 的方法
frame.loc['a'] #取最外层的行标签
frame.loc['a',1] #取内层的单个行标签
frame.ix['a',[1,2]] #取内层的多个行标签(.ix 已弃用,见下方警告,应改用 .loc)
E:\WinPython\WPy-3662\python-3.6.6.amd64\lib\site-packages\ipykernel_launcher.py:3: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
This is separate from the ipykernel package so we can avoid doing imports until
state | Ohio | Colorado |
---|---|---|
color | Red | Green |
key2 | ||
1 | 1 | 2 |
2 | 4 | 5 |
(3)行列一起索引
frame.loc['a']['Ohio','Green'] #先索引行标签,在此基础上取列
key2
1 0
2 3
Name: (Ohio, Green), dtype: int32
df=DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
df
key1 | key2 | data1 | data2 | |
---|---|---|---|---|
0 | a | one | -0.847544 | 0.003861 |
1 | a | two | 1.510283 | 0.623520 |
2 | b | one | -0.099401 | 0.038992 |
3 | b | two | 0.191581 | -0.927451 |
4 | a | one | 0.586113 | -0.571834 |
df['data1'].groupby(df['key1']).mean()
df['data1'].groupby([df['key1'],df['key2']]).mean()
year=np.array([2005,2005,2006,2005,2006])
df[‘data1‘].groupby(year).mean()
2005 0.284774
2006 0.243356
Name: data1, dtype: float64
df.groupby('key1').mean() #结果中没有key2,因为key2是非数值列被排除
data1 | data2 | |
---|---|---|
key1 | ||
a | 0.416284 | 0.018515 |
b | 0.046090 | -0.444229 |
groupby 的.size() 的方法。可以返回一个表示各个分组大小的Series
df.groupby(['key1','key2']).size()
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
对行索引的层级汇总
frame.sum(level='key1')
state | Ohio | Colorado | |
---|---|---|---|
color | Green | Red | Green |
key1 | |||
a | 3 | 5 | 7 |
b | 15 | 17 | 19 |
对列索引的层级汇总,要用axis指定
frame.sum(level='color',axis=1)
df.groupby('key1') #返回的是一个groupby对象
for key1,group in df.groupby('key1'):
    print (key1)
    print (group)
for (key1,key2),group in df.groupby(['key1','key2']):
    print (key1,key2)
    print (group)
a
key1 key2 data1 data2
0 a one -0.847544 0.003861
1 a two 1.510283 0.623520
4 a one 0.586113 -0.571834
b
key1 key2 data1 data2
2 b one -0.099401 0.038992
3 b two 0.191581 -0.927451
a one
key1 key2 data1 data2
0 a one -0.847544 0.003861
4 a one 0.586113 -0.571834
a two
key1 key2 data1 data2
1 a two 1.510283 0.62352
b one
key1 key2 data1 data2
2 b one -0.099401 0.038992
b two
key1 key2 data1 data2
3 b two 0.191581 -0.927451
#以下两种方法是等价的
df['data1'].groupby(df['key1'])
df.groupby('key1')['data1']
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001C35A75F358>
#groupby对象转成字典
# dict(list(df.groupby('key1')))
#用字典作为分组键(按列分组)
mapping={'data1':'a','data2':'b','key1':'c','key2':'d'}
df.groupby(mapping,axis=1).sum() #以映射后的值为分组键
a | b | c | d | |
---|---|---|---|---|
0 | -0.847544 | 0.003861 | a | one |
1 | 1.510283 | 0.623520 | a | two |
2 | -0.099401 | 0.038992 | b | one |
3 | 0.191581 | -0.927451 | b | two |
4 | 0.586113 | -0.571834 | a | one |
【原】《利用Python进行数据分析》学习笔记之Pandas基础
原文:https://www.cnblogs.com/laiyaling/p/10062278.html