Pandas学习笔记

时间：2020-01-04 14:58:29 阅读：96 评论：0 收藏：0 [点我收藏+]

认识pandas

import pandas as pd
import  numpy as np
?
#创建数组
# t1 = pd.Series([1,23,44,5,56,6])
# print(t1)
‘‘‘
0     1
1    23
2    44
3     5
4    56
5     6
‘‘‘
?
#指定索引值
# t2 = pd.Series(np.arange(5),index=list(‘abcde‘))
# print(t2)
‘‘‘
a    0
b    1
c    2
d    3
e    4
dtype: int32
‘‘‘
?
#通过字典创建
?
# dict = {‘name‘:‘tony‘,‘age‘:23,‘sex‘:"fale",‘tel‘:10012}
# t3 = pd.Series(dict)
# print(t3)
?
‘‘‘
name     tony
age        23
sex      fale
tel     10012
dtype: object #对象，代表数组中含有字符串
‘‘‘
?
#pandas切片和索引
?
# print(t3[1]) #  23 按默认索引值取
# print(t3[‘age‘])  #  23 按字典键值取
?
#取连续多行
?
#print(t3[:2])
‘‘‘
name    tony
age       23
dtype: object
‘‘‘
?
#取不连续多行
# print(t3[[1,2]])
‘‘‘
age      23
sex    fale
dtype: object
‘‘‘
# print(t3[[‘age‘,‘tel‘]])
‘‘‘
age       23
tel    10012
dtype: object
‘‘‘
?
#取满足条件的数组
# t4 = pd.Series([1,5,66,23,44,5,56,6])
# print(t4[t4>10])
‘‘‘
2    66
3    23
4    44
6    56
‘‘‘
?
#对于一个陌生的series类型， 我们如何知道他的索引和具体的值呢
# Build a Series from a mapping: the keys become the index labels.
# The mapping is bound to ``person`` rather than ``dict`` so the builtin
# ``dict`` type stays usable in the rest of the script.
person = {'name': 'tony', 'age': 23, 'sex': "fale", 'tel': 10012}
t3 = pd.Series(person)
?
#索引值
# print(t3.index) # Index([‘name‘, ‘age‘, ‘sex‘, ‘tel‘], dtype=‘object‘)
# for i in t3.index:
    # print(i)
‘‘‘
name
age
sex
tel
‘‘‘
# print(list(t3.index)[:2]) #[‘name‘, ‘age‘]
?
?
#具体值
# print(t3.values) # [‘tony‘ 23 ‘fale‘ 10012]
# print(type(t3.values)) #<class ‘numpy.ndarray‘>

pandas读取外部数据

import pandas as pd
import  numpy as np
?
# 读取美国YouTube.csv文件内容
# df = pd.read_csv(‘./美国YouTube.csv‘)
#将内容likes这一列进行从大到小排序
# df = df.sort_values(by=‘likes‘,ascending=False)
# print(df)
?
#取likes数组大于10000小于100000的对象
# print(df[(10000<df[‘likes‘])&(df[‘likes‘]<100000)])
?
##取likes数组大于10000且名字的长度大于4的目标对象
# print(df[(10000<df[‘likes‘])&(df[‘name‘].str.len()>4)])
‘‘‘
         name  view_count   likes  dislikes  comment_count
6       ttttt     2491513  426184      8606          27555
2       ttttt     2744190  230435      5618          21527
146     ttttt     2455037  210879      4738          19801
175    ggggbr      556577  126773      1690          12318
86     ggggbr      673533   91355      1482           8204
44      ttttt     1839655   86827      3182           9375
53      ttttt      792211   83095      2407           2830
13   adsfxgch      494522   61940      1522           6246
10      ttttt      312876   59976       930           3459
5      ggggbr      426563   52433      1214           1602
27      ttttt      359408   47662       588           1373
22     ggggbr      449180   43051       456           1660
18     ggggbr     1307564   42445     18635          17629
47   adsfxgch      233780   37787       420           1380
23      ttttt      178437   34479       401           3843
162    ggggbr      228233   34224       285           1520
121     ttttt      474103   30240      3334           9357
19      ttttt      206338   27231       520           2685
78      ttttt     1464286   24908      1880            818
142     ttttt       91334   22402       484           1968
70      ttttt      100288   19004       148           1202
43     ggggbr      133892   18874       205           1587
192    ggggbr       53804   13622        42            767
1      ggggbr      764890   13586       364           1249
112     ttttt       94626   12942       218            847
73     ggggbr      151609   11916       117            618
171    ggggbr      122858   10916       285           3381
‘‘‘

pandas之DataFrame

import pandas as pd
import  numpy as np
?
#DataFrame
?
# t1 = pd.DataFrame(np.arange(12).reshape((3,4)))
# print(t1)
‘‘‘
   0  1   2   3  #列索引
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
?
DataFrame对象既有行索引， 又有列索引
行索引， 表明不同行， 横向索引， 叫index， 0轴， axis=0
列索引， 表明不同列， 纵向索引， 叫columns， 1轴，axis=1
‘‘‘
?
# t2 = pd.DataFrame(np.arange(12).reshape((3,4)),index=list(‘abc‘),columns=list(‘ABCD‘))
# print(t2)
‘‘‘
   A  B   C   D
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
‘‘‘
?
# dict = {‘name‘:[‘tony‘,‘alex‘],‘age‘:[23,12],‘sex‘:["fale",‘female‘],‘tel‘:[10012,10010]}
# t3 = pd.DataFrame(dict)
# print(t3)
‘‘‘
   name  age     sex    tel
0  tony   23    fale  10012
1  alex   12  female  10010
‘‘‘
?
# Build a DataFrame from a list of records (one dict per row).
# The column set is the union of all keys; a key missing from a record
# becomes NaN in that row.
dict2 = [
    {'name': 'tony1', 'age': 21, 'sex': "fale1", 'tel': 10011},
    {'name': 'tony2', 'age': 22, 'sex': "fale2"},
    {'name': 'tony3', 'age': 23, 'tel': 10013},
]
t4 = pd.DataFrame(dict2)
# print(t4) #如果对应值缺失，则显示NaN
‘‘‘
    name  age    sex      tel
0  tony1   21  fale1  10011.0
1  tony2   22  fale2      NaN
2  tony3   23    NaN  10013.0
‘‘‘
# print(t4.shape) # (3, 4)
?
# print(t4.dtypes) # 每一列的数据类型
‘‘‘
name     object
age       int64
sex      object
tel     float64
‘‘‘
?
# print(t4.ndim) # 2 数据维度
?
# print(t4.index) # RangeIndex(start=0, stop=3, step=1) ,行索引
#
# print(t4.columns) # Index([‘name‘, ‘age‘, ‘sex‘, ‘tel‘], dtype=‘object‘) 列索引
#
# print(t4.values)  #对象值
‘‘‘
[[‘tony1‘ 21 ‘fale1‘ 10011.0]
 [‘tony2‘ 22 ‘fale2‘ nan]
 [‘tony3‘ 23 nan 10013.0]]
‘‘‘
#显示前几行
# print(t4.head(2))
‘‘‘
    name  age    sex      tel
0  tony1   21  fale1  10011.0
1  tony2   22  fale2      NaN
‘‘‘
?
#显示尾部几行
# print(t4.tail(2))
‘‘‘
    name  age    sex      tel
1  tony2   22  fale2      NaN
2  tony3   23    NaN  10013.0
‘‘‘
?
#显示t4的整体信息
# print(t4.info())
‘‘‘
<class ‘pandas.core.frame.DataFrame‘> #
RangeIndex: 3 entries, 0 to 2  #行信息
Data columns (total 4 columns): #列信息
name    3 non-null object
age     3 non-null int64
sex     2 non-null object
tel     2 non-null float64
dtypes: float64(1), int64(1), object(2) #出现的数据类型种类及次数
memory usage: 224.0+ bytes   #储存大小
None
‘‘‘
?
#对于数组中为数字的数据类型列进行快速的统计
# print(t4.describe()) #统计t4中的age和tel两个数字类型列的信息
‘‘‘
        age           tel
count   3.0      2.000000
mean   22.0  10012.000000
std     1.0      1.414214
min    21.0  10011.000000
25%    21.5  10011.500000
50%    22.0  10012.000000
75%    22.5  10012.500000
max    23.0  10013.000000
‘‘‘
?
# Load the YouTube statistics CSV from the current working directory.
df = pd.read_csv('./美国YouTube.csv')
# print(df)
# print(df.head()) #默认读取前5行
# print(df.info())
‘‘‘
<class ‘pandas.core.frame.DataFrame‘>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
view_count       200 non-null int64
likes            200 non-null int64
dislikes         200 non-null int64
comment_count    200 non-null int64
dtypes: int64(4)
memory usage: 6.4 KB
None
‘‘‘
?
#dataFrame中排序的方法
#将likes这一列进行排序，默认是升序，ascending=False为降序
?
# df = df.sort_values(by=‘likes‘,ascending=False)
# print(df)

pandas中取行和列

import pandas as pd
import  numpy as np
?
#读取美国YouTube.csv文件内容
# df = pd.read_csv(‘./美国YouTube.csv‘)
# #将内容likes这一列进行从大到小排序
# df = df.sort_values(by=‘likes‘,ascending=False)
# print(df)
?
?
#取前20行
#
# print(df[:20])
?
#取某列，直接把列索引填入即可
# print(df[‘likes‘])
?
#取前20行的likes列值
# print(df[:20][‘likes‘])
?
?
?
#t.loc通过标签索引行数据
?
# 3x4 demo frame with row labels a-c and column labels W-Z, used by the
# .loc (label based) and .iloc (position based) examples below.
t = pd.DataFrame(np.arange(12).reshape((3, 4)), index=list('abc'), columns=list('WXYZ'))
# print(t)
‘‘‘
   W  X   Y   Z
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
‘‘‘
#取a行
# print(t.loc[‘a‘])
# #或
# print(t.loc[‘a‘,:])
?
#取Z列
# print(t.loc[:,‘Z‘])
?
#取多行和多列
?
#取不连续多行
# print(t.loc[[‘a‘,‘c‘]]) #两个中括号
‘‘‘
   W  X   Y   Z
a  0  1   2   3
c  8  9  10  11
‘‘‘
?
#取多列
# print(t.loc[:,[‘W‘,‘Y‘]]) #两个中括号
‘‘‘
   W   Y
a  0   2
b  4   6
c  8  10
‘‘‘
?
#同时取多行和多列
# print(t.loc[[‘a‘,‘c‘],[‘W‘,‘Y‘]]) #两个中括号
‘‘‘
   W   Y
a  0   2
c  8  10
‘‘‘
?
# print(t.loc[‘a‘:‘c‘,[‘W‘,‘Y‘]])  # ‘a‘:‘c‘表示从a到c行都可以取到
‘‘‘
   W   Y
a  0   2
b  4   6
c  8  10
‘‘‘
?
#t.iloc通过位置索引行数据
?
# #索引值为1的行
# print(t.iloc[1,:])
?
#取不连续多行
# print(t.iloc[[0,2]])
‘‘‘
   W  X   Y   Z
a  0  1   2   3
c  8  9  10  11
‘‘‘
?
#索引值为1的列
# print(t.iloc[:,1])
?
#取多列
# print(t.iloc[:,[1,3]])
‘‘‘
   X   Z
a  1   3
b  5   7
c  9  11
‘‘‘
?
#取多行多列
# print(t.iloc[[1,2],[1,3]])
‘‘‘
   X   Z
b  5   7
c  9  11
‘‘‘
?
# print(t.iloc[1:,:2])
‘‘‘
   W  X
b  4  5
c  8  9
‘‘‘
# Blank out the first two columns of every row after the first.
# NaN cannot be stored in an integer column, so the two target columns are
# converted to float explicitly first; relying on the assignment to upcast
# them implicitly is deprecated in recent pandas releases.
t = t.astype({col: float for col in t.columns[:2]})
t.iloc[1:, :2] = np.nan
print(t)
‘‘‘
     W    X   Y   Z
a  0.0  1.0   2   3
b  NaN  NaN   6   7
c  NaN  NaN  10  11
‘‘‘

pandas之布尔索引

import pandas as pd
import  numpy as np
?
# 读取美国YouTube.csv文件内容
# df = pd.read_csv(‘./美国YouTube.csv‘)
#将内容likes这一列进行从大到小排序
# df = df.sort_values(by=‘likes‘,ascending=False)
# print(df)
?
#取likes数组大于10000小于100000的对象
# print(df[(10000<df[‘likes‘])&(df[‘likes‘]<100000)])
?
##取likes数组大于10000且名字的长度大于4的目标对象
# print(df[(10000<df[‘likes‘])&(df[‘name‘].str.len()>4)])
‘‘‘
         name  view_count   likes  dislikes  comment_count
6       ttttt     2491513  426184      8606          27555
2       ttttt     2744190  230435      5618          21527
146     ttttt     2455037  210879      4738          19801
175    ggggbr      556577  126773      1690          12318
86     ggggbr      673533   91355      1482           8204
44      ttttt     1839655   86827      3182           9375
53      ttttt      792211   83095      2407           2830
13   adsfxgch      494522   61940      1522           6246
10      ttttt      312876   59976       930           3459
5      ggggbr      426563   52433      1214           1602
27      ttttt      359408   47662       588           1373
22     ggggbr      449180   43051       456           1660
18     ggggbr     1307564   42445     18635          17629
47   adsfxgch      233780   37787       420           1380
23      ttttt      178437   34479       401           3843
162    ggggbr      228233   34224       285           1520
121     ttttt      474103   30240      3334           9357
19      ttttt      206338   27231       520           2685
78      ttttt     1464286   24908      1880            818
142     ttttt       91334   22402       484           1968
70      ttttt      100288   19004       148           1202
43     ggggbr      133892   18874       205           1587
192    ggggbr       53804   13622        42            767
1      ggggbr      764890   13586       364           1249
112     ttttt       94626   12942       218            847
73     ggggbr      151609   11916       117            618
171    ggggbr      122858   10916       285           3381
‘‘‘

pandas缺失值处理

import pandas as pd
import  numpy as np
?
# 4x6 demo frame for the missing-value examples: columns U and V are NaN in
# rows B-D, and cell (B, Z) is set to 0 to illustrate a "zero as missing" value.
t = pd.DataFrame(np.arange(24).reshape((4, 6)), index=list('ABCD'), columns=list('UVWXYZ'))
# U and V are made float explicitly so that storing NaN does not depend on
# implicit integer-to-float upcasting during assignment.
t = t.astype({'U': float, 'V': float})
t.iloc[1:, :2] = np.nan
t.iloc[1, 5] = 0
# print(t)
‘‘‘
     U    V   W   X   Y   Z
A  0.0  1.0   2   3   4   5
B  NaN  NaN   8   9  10   0
C  NaN  NaN  14  15  16  17
D  NaN  NaN  20  21  22  23
?
?
?
我们的数据缺失通常有两种情况：
一种就是空， None等， 在pandas是NaN(和np.nan一样)
另一种是我们让其为0（例如上表中B行Z列的0）
‘‘‘
?
#判断数据是否为NaN： pd.isnull(df),pd.notnull(df)
?
# print(pd.isnull(t))
‘‘‘
A  False  False  False  False  False  False
B   True   True  False  False  False  False
C   True   True  False  False  False  False
D   True   True  False  False  False  False
‘‘‘
# print(pd.notnull(t))
‘‘‘
A   True   True  True  True  True  True
B  False  False  True  True  True  True
C  False  False  True  True  True  True
D  False  False  True  True  True  True
‘‘‘
?
#取U列中，不为NaN的行
# print(t[pd.notnull(t[‘U‘])])
‘‘‘
     U    V  W  X  Y  Z
A  0.0  1.0  2  3  4  5
‘‘‘
?
#处理方式1： 删除NaN所在的行列dropna (axis=0, how=‘any‘, inplace=False)
?
#删除NaN所在的行,默认any,只要该行中有一个NaN就删除该行
# print(t.dropna(axis=0))
# ‘‘‘
#      U    V  W  X  Y  Z
# A  0.0  1.0  2  3  4  5
# ‘‘‘
?
# print(t.dropna(axis=0,how=‘any‘))
‘‘‘
     U    V  W  X  Y  Z
A  0.0  1.0  2  3  4  5
‘‘‘
?
#how='all'：只有当某一行的所有值都为NaN时才删除该行
# print(t.dropna(axis=0,how=‘all‘))
‘‘‘
     U    V   W   X   Y   Z
A  0.0  1.0   2   3   4   5
B  NaN  NaN   8   9  10   0
C  NaN  NaN  14  15  16  17
D  NaN  NaN  20  21  22  23
‘‘‘
?
?
# inplace是否进行原地修改
# t.dropna(axis=0,how=‘any‘,inplace=True)
# print(t)
‘‘‘
     U    V  W  X  Y  Z
A  0.0  1.0  2  3  4  5
‘‘‘
?
#填充数据fillna
?
#将NaN都填充为5
# print(t.fillna(5))
‘‘‘
     U    V   W   X   Y   Z
A  0.0  1.0   2   3   4   5
B  5.0  5.0   8   9  10   0
C  5.0  5.0  14  15  16  17
D  5.0  5.0  20  21  22  23
‘‘‘
?
#将NaN都填充为均值,此均值为对应的除NaN以外的该列的其他数字均值
#还可填充其他如t.fillna(t.median())
# print(t.fillna(t.mean()))
‘‘‘
     U    V   W   X   Y   Z
A  0.0  1.0   2   3   4   5
B  0.0  1.0   8   9  10   0
C  0.0  1.0  14  15  16  17
D  0.0  1.0  20  21  22  23
?
U列只有一个非NaN值为0，则此平均值为0
V列只有一个非NaN值为1，则此平均值为1
所以U列填充0，V列填充1
‘‘‘
?
#只想填充V列的NaN值
# t[‘V‘] = t[‘V‘].fillna(t[‘V‘].mean())
# print(t)
‘‘‘
     U    V   W   X   Y   Z
A  0.0  1.0   2   3   4   5
B  NaN  1.0   8   9  10   0
C  NaN  1.0  14  15  16  17
D  NaN  1.0  20  21  22  23
‘‘‘
?
?
‘‘‘
处理为0的数据： t[t==0]=np.nan
当然并不是每次为0的数据都需要处理
计算平均值等情况， nan是不参与计算的， 但是0会
‘‘‘

pandas常用统计方法

import pandas as pd
import  numpy as np
import matplotlib.pyplot as plt
?
# Load the IMDB movie data set from the current working directory.
df = pd.read_csv('./IMDB-Movie-Data.csv')
# print(df.info())
?
# print(df.head(1))
?
#获取电影平均评分
# print(df[‘Rating‘].mean()) # 6.723200000000003
?
#获取导演人数
# print(len(set(df[‘Director‘].tolist()))) #644
# #获取不重复的导演名字
# print(df[‘Director‘].unique())
# print(len(df[‘Director‘].unique())) #644
?
?
#获取演员人数
?
# tmp_actor_list = df[‘Actors‘].str.split(‘,‘).tolist()
# #for j in tmp_actor_list:
##     for i in j:
# #         print(i)
# actor_list = [i for j in tmp_actor_list for i in j]
# actors_nums = len(set(actor_list))
# print(actors_nums) #2394
?
?
# max_runtime  = df[‘Runtime (Minutes)‘].max()
# min_runtime  = df[‘Runtime (Minutes)‘].min()
# mean_runtime  = df[‘Runtime (Minutes)‘].mean()
# median_runtime  = df[‘Runtime (Minutes)‘].median()
# print(max_runtime) #191
# print(min_runtime) #66
# print(mean_runtime) # 113.172
# print(median_runtime) #111.0
?
‘‘‘
1.对于这一组电影数据， 如果我们想了解rating（电影评分）， runtime（时长）的分布情况，
 应该如何呈现数据？
‘‘‘
# #时长分析
#
# #选择图形，直方图
# #找出数据的最大最小值
#
# runtime_data = df[‘Runtime (Minutes)‘].values
#
# max_runtime = runtime_data.max()
#
# min_runtime = runtime_data.min()
#
# #计算组数
# # print(max_runtime - min_runtime)
# num_bins = (max_runtime - min_runtime)//5 #商取整
# # print(num_bins)
#
# plt.figure(figsize=(20,8),dpi = 80)
# plt.hist(runtime_data,num_bins)
#
# #设置x轴刻度
# plt.xticks(range(min_runtime,max_runtime+5,5))
# plt.savefig(‘./07时长分布.png‘)
# plt.show()
?
?
#rating（电影评分）分析
?
#评分分析
?
#选择图形，直方图
#找出数据的最大最小值
?
# rating_data = df[‘Rating‘].values
#
# max_rating = rating_data.max()
#
# min_rating = rating_data.min()
#
# #计算组数
# # print(max_rating ,min_rating) #9.0 1.9
#
# num_bin_list = [1.6]
# sum = 1.6
# for i in range(11):
#     sum +=0.5
#     num_bin_list.append(sum)
# print(num_bin_list)
#
#
# plt.figure(figsize=(20,8),dpi = 80)
# plt.hist(rating_data,num_bin_list)
#
# #设置x轴刻度
#
#
# plt.savefig(‘./07评分分析.png‘)
# plt.show()
?
‘‘‘
2.对于这一组电影数据， 如果我们希望统计电影分类(genre)的情况， 应该如何处
理数据？
‘‘‘
#即每种类型的电影有多少部
?
#选择条形图
# print(df[‘Genre‘])
#
# #1.统计分类的列表
# temp_list = df[‘Genre‘].str.split(‘,‘).tolist()
# # print(temp_list)
#
# # for j in temp_list:
# #     for i in j:
# #         print(i)
#
# genre_list = list(set([i for j in temp_list for i in j]))
# # print(genre_list)
#
# #构造全为0的数组
#
# zeros_df  = pd.DataFrame(np.zeros((df.shape[0],len(genre_list))),
#                          columns=genre_list)
# # print(zeros_df)
#
# #给每个电影出现分类的位置赋值为1
#
# for i in range(df.shape[0]):
#     #zeros_df.loc[0,[Action,Adventure,Sci-Fi]] = 1
#     zeros_df.loc[i,temp_list[i]] = 1
# # print(zeros_df.head(5))
# # print(zeros_df)
#
# #统计每个类目电影的数量
# genre_count = zeros_df.sum(axis = 0)
# # print(genre_count)
#
# #排序
# genre_count = genre_count.sort_values()
# # print(genre_count)
# x = genre_count.index
# y = genre_count.values
#
#
# #画图
# plt.figure(figsize=(20,8),dpi=80)
#
# plt.bar(range(len(x)),y,color = ‘y‘,alpha = 0.6)
# plt.xticks(range(len(x)),x)
#
# plt.savefig(‘./07各类型电影数量.png‘)
# plt.show()

数据合并join和merge

import pandas as pd
import  numpy as np
import matplotlib.pyplot as plt
?
#join:默认情况下他是把行索引相同的数据合并到一起
#
# t1 = pd.DataFrame(np.zeros((2,5)),index=list(‘AB‘),columns=list(‘VWXYZ‘))
# print(t1)
# ‘‘‘
#      V    W    X    Y    Z
# A  0.0  0.0  0.0  0.0  0.0
# B  0.0  0.0  0.0  0.0  0.0
# ‘‘‘
#
# t2 = pd.DataFrame(np.ones((3,4)),index=list(‘ABC‘))
# print(t2)
# ‘‘‘
#      0    1    2    3
# A  1.0  1.0  1.0  1.0
# B  1.0  1.0  1.0  1.0
# C  1.0  1.0  1.0  1.0
# ‘‘‘
#
# print(t2.join(t1))
# ‘‘‘
#      0    1    2    3    V    W    X    Y    Z
# A  1.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0
# B  1.0  1.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0
# C  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN  NaN
# ‘‘‘
# print(t1.join(t2))
# ‘‘‘
#      V    W    X    Y    Z    0    1    2    3
# A  0.0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# B  0.0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# ‘‘‘
?
?
# #merge:默认情况下他是把列索引相同的数据合并到一起
#
#
# df1 = pd.DataFrame(np.ones((2,4)),index=list(‘AB‘),columns=list(‘abcd‘))
# # print(df1)
# ‘‘‘
#      a    b    c    d
# A  1.0  1.0  1.0  1.0
# B  1.0  1.0  1.0  1.0
# ‘‘‘
#
# df3 = pd.DataFrame(np.arange(9).reshape((3,3)),columns=list(‘fax‘))
# # print(df3)
# ‘‘‘
#    f  a  x
# 0  0  1  2
# 1  3  4  5
# 2  6  7  8
# ‘‘‘
#
# # print(df1.merge(df3,on=‘a‘)) #按照a列进行合并，df1中a列有2个值和df3中的a值相同
# ‘‘‘
#      a    b    c    d  f  x
# 0  1.0  1.0  1.0  1.0  0  2
# 1  1.0  1.0  1.0  1.0  0  2
# ‘‘‘
# df1.loc[‘A‘,‘a‘] = 100
# # print(df1)
# ‘‘‘
#        a    b    c    d
# A  100.0  1.0  1.0  1.0
# B    1.0  1.0  1.0  1.0
# ‘‘‘
# # print(df1.merge(df3,on=‘a‘)) #按照a列进行合并，df1中a列有1个值和df3中的a值相同
# ‘‘‘
#      a    b    c    d  f  x
# 0  1.0  1.0  1.0  1.0  0  2
# ‘‘‘
#
# #外连接，并集,NaN补全
# # print(df1.merge(df3,on=‘a‘,how=‘outer‘))
# ‘‘‘
#        a    b    c    d    f    x
# 0  100.0  1.0  1.0  1.0  NaN  NaN
# 1    1.0  1.0  1.0  1.0  0.0  2.0
# 2    4.0  NaN  NaN  NaN  3.0  5.0
# 3    7.0  NaN  NaN  NaN  6.0  8.0
# ‘‘‘
#
# #左连接,以左边df1为基准，NaN补全
# # print(df1.merge(df3,on=‘a‘,how=‘left‘))
# ‘‘‘
#        a    b    c    d    f    x
# 0  100.0  1.0  1.0  1.0  NaN  NaN
# 1    1.0  1.0  1.0  1.0  0.0  2.0
# ‘‘‘
#
# #右连接,以右边df3为基准，NaN补全
# print(df1.merge(df3,on=‘a‘,how=‘right‘))
# ‘‘‘
#      a    b    c    d  f  x
# 0  1.0  1.0  1.0  1.0  0  2
# 1  4.0  NaN  NaN  NaN  3  5
# 2  7.0  NaN  NaN  NaN  6  8
# ‘‘‘

分组和聚合

import pandas as pd
import  numpy as np
import matplotlib.pyplot as plt

# Load the worldwide Starbucks store list from the current working directory.
file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

##grouped = df.groupby(by=‘Country‘) #按照国家进行分组
# print(grouped)

#DataFrameGroupBy


#可以进行遍历
# for i in grouped: #对国家进行遍历，每个grouped是一个国家的相关数据
#     print(i)
#     print("*"*100)

#调用聚合方法
# print(grouped.count()) #计算每个国家的数量
# country_count = grouped[‘Brand‘].count()

# print(country_count[‘US‘]) #美国星巴克数量
# print(country_count[‘CN‘]) #中国星巴克数量

#统计中国每个省份的星巴克数量
# china_data = df[df[‘Country‘] == ‘CN‘]
# # print(china_data)
#
# grouped = china_data.groupby(by = ‘State/Province‘).count()[‘Brand‘]
# print(grouped)

#数据按照多个条件进行分组，返回Series

#先按国家分组，再按省份/州（State/Province）分组
# grouped = df[‘Brand‘].groupby(by=[df[‘Country‘],df[‘State/Province‘]]).count()
# print(grouped,type(grouped))
‘‘‘
AD       7                  1
AE       AJ                 2
         AZ                48
         DU                82
         FU                 2
                           ..
US       WV                25
         WY                23
VN       HN                 6
         SG                19
ZA       GT                 3
Name: Brand, Length: 545, dtype: int64 <class ‘pandas.core.series.Series‘>
‘‘‘


#数据按照多个条件进行分组，返回DataFrame,
# 多加了一个方括号***
# Three equivalent ways to count stores per (Country, State/Province) and get
# a one-column DataFrame rather than a Series.  The double brackets
# ([['Brand']]) are what keep the result two-dimensional.
# Variant 1: select the column as a frame first, then group by the key Series.
grouped1 = df[['Brand']].groupby(by=[df['Country'], df['State/Province']]).count()
# Variant 2: group the whole frame, then select the column as a frame.
grouped2 = df.groupby(by=[df['Country'], df['State/Province']])[['Brand']].count()
# Variant 3: like variant 1, with the column selected again after counting.
grouped3 = df[['Brand']].groupby(by=[df['Country'], df['State/Province']]).count()[['Brand']]
# print(grouped1,type(grouped1))
# print(grouped2,type(grouped2))
# print(grouped3,type(grouped3)) #[545 rows x 1 columns] <class ‘pandas.core.frame.DataFrame‘>

‘‘‘
以上的3条命令结果和前面的一样
和之前的结果的区别在于当前返回的是一个DataFrame类型
‘‘‘

#索引的方法和属性
# print(grouped1.index)

pandas索引和复合索引

import pandas as pd
import  numpy as np
import matplotlib.pyplot as plt

# 3x3 demo frame with columns f, a, x and the default RangeIndex, used by the
# index examples below.
df = pd.DataFrame(np.arange(9).reshape((3, 3)), columns=list('fax'))
# print(df)

‘‘‘
   f  a  x
0  0  1  2
1  3  4  5
2  6  7  8
‘‘‘

#索引的方法和属性

#简单的索引操作：
#•获取index： df.index
# print(df.index) #RangeIndex(start=0, stop=3, step=1)

# •指定index ： df.index = []
# df.index = [‘m‘,‘n‘,‘o‘]
# print(df)
‘‘‘
   f  a  x
m  0  1  2
n  3  4  5
o  6  7  8
‘‘‘

# •重新设置index : df.reindex(list("abc"))

# print(df.reindex(list(‘abc‘)))
‘‘‘
    f   a   x
a NaN NaN NaN
b NaN NaN NaN
c NaN NaN NaN
‘‘‘

# •指定某一列作为index ： df.set_index("a")
# print(df.set_index(‘a‘)) #指定a列为行索引
‘‘‘
   f  x
a      
1  0  2
4  3  5
7  6  8
‘‘‘
#如果仍然想保留原a列，df.set_index("a",drop=False)
# print(df.set_index(‘a‘,drop=False))
‘‘‘
   f  a  x
a         
1  0  1  2
4  3  4  5
7  6  7  8
‘‘‘

# •返回index的唯一值： df.set_index("Country").index.unique()
# 3x4 frame filled with 1.0 (rows A-C, columns a-d); every column holds a
# single repeated value, which makes the unique() examples below easy to read.
df1 = pd.DataFrame(np.ones((3, 4)), index=list('ABC'), columns=list('abcd'))
# print(df1)
‘‘‘
     a    b    c    d
A  1.0  1.0  1.0  1.0
B  1.0  1.0  1.0  1.0
C  1.0  1.0  1.0  1.0
‘‘‘
# print(df1[‘a‘].unique()) #[1.]
# print(df1.set_index(‘a‘).index.unique())#Float64Index([1.0], dtype=‘float64‘, name=‘a‘)
#
# print(len(df1.set_index(‘a‘).index.unique())) # 1



#复合索引


# a = df1.set_index([‘a‘,‘b‘,‘c‘],drop=False)
#
# print(a)
‘‘‘
               a    b    c    d
a   b   c                      
1.0 1.0 1.0  1.0  1.0  1.0  1.0
        1.0  1.0  1.0  1.0  1.0
        1.0  1.0  1.0  1.0  1.0
‘‘‘

# Sample frame used to demonstrate a two-level (composite) row index.
# It is bound to ``frame`` rather than ``dict`` so the builtin is not shadowed.
frame = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': list("hjklmno"),
})
# print(frame)
# Expected output:
#    a  b    c  d
# 0  0  7  one  h
# 1  1  6  one  j
# 2  2  5  one  k
# 3  3  4  two  l
# 4  4  3  two  m
# 5  5  2  two  n
# 6  6  1  two  o

# Use columns c and d as a two-level row index; drop=False also keeps them
# as ordinary columns.
b = frame.set_index(['c', 'd'], drop=False)
# print(b)
# Expected output:
#        a  b    c  d
# c   d
# one h  0  7  one  h
#     j  1  6  one  j
#     k  2  5  one  k
# two l  3  4  two  l
#     m  4  3  two  m
#     n  5  2  two  n
#     o  6  1  two  o

# Selecting one column of the multi-indexed frame gives a Series that carries
# the same two-level index.
c = b['a']
# print(c,type(c))
# Expected output:
# c    d
# one  h    0
#      j    1
#      k    2
# two  l    3
#      m    4
#      n    5
#      o    6
# Name: a, dtype: int64 <class 'pandas.core.series.Series'>
# print(c['one']['j']) # 1

# Same values, with the index levels in the opposite order (d outer, c inner).
d = frame.set_index(['d', 'c'])['a']
# print(d)
‘‘‘
d  c  
h  one    0
j  one    1
k  one    2
l  two    3
m  two    4
n  two    5
o  two    6
Name: a, dtype: int64
‘‘‘
# print(d.index)
‘‘‘
MultiIndex([(‘h‘, ‘one‘),
            (‘j‘, ‘one‘),
            (‘k‘, ‘one‘),
            (‘l‘, ‘two‘),
            (‘m‘, ‘two‘),
            (‘n‘, ‘two‘),
            (‘o‘, ‘two‘)],
           names=[‘d‘, ‘c‘])
‘‘‘

#取d中one对应的值，swaplevel交换里外层索引顺序


# print(d.swaplevel()[‘one‘])
‘‘‘
d
h    0
j    1
k    2
Name: a, dtype: int64
‘‘‘

# print(b.loc[‘one‘].loc[‘h‘])
‘‘‘
a      0
b      7
c    one
d      h
‘‘‘

# print(b.swaplevel().loc[‘h‘])
‘‘‘
     a  b    c  d
c                
one  0  7  one  h
‘‘‘

练习

使用matplotlib呈现出店铺总数排名前10的国家

import pandas as pd
import  numpy as np
import matplotlib.pyplot as plt
import codecs

file_path = './starbucks_store_worldwide.csv'
# The file contains bytes that are not valid UTF-8, which makes a plain
# read_csv fail.  Decode it as UTF-8 and drop the undecodable bytes.
# The handle is opened in a ``with`` block so it is closed after parsing, and
# ``file_path`` keeps holding the path instead of being rebound to the handle.
with open(file_path, 'r', encoding='utf-8', errors='ignore', newline='') as csv_file:
    df = pd.read_csv(csv_file)

# print(df.head(1))
# print(df.info())

# #准备数据
#
# data1 = df.groupby(by=‘Country‘).count()[‘Brand‘].sort_values(ascending=False)[:10]
# # print(data1)
#
# x = data1.index
# y = data1.values
#
# #画图
# plt.figure(figsize=(20,8),dpi = 80)
# plt.bar(range(len(x)),y)
# plt.xticks(range(len(x)),x)
#
# plt.savefig(‘./11店铺总数排名前10的国家.png‘)
# plt.show()

使用matplotlib呈现出中国每个城市的店铺数量

import codecs
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import font_manager

# The tick labels are drawn with ``my_font``; presumably the city names in the
# CSV are Chinese and need a CJK-capable font (confirm against the data).
# The path below is the macOS location of Songti.  When it is not present,
# matplotlib's default font properties are used so the script still runs.
font_path = "/Library/Fonts/Songti.ttc"
if os.path.isfile(font_path):
    my_font = font_manager.FontProperties(fname=font_path)
else:
    my_font = font_manager.FontProperties()

file_path = './starbucks_store_worldwide.csv'
# The file contains bytes that are not valid UTF-8, which makes a plain
# read_csv fail.  Decode it as UTF-8, drop the undecodable bytes, and close
# the handle once parsing is done.
with open(file_path, 'r', encoding='utf-8', errors='ignore', newline='') as csv_file:
    df = pd.read_csv(csv_file)

# print(df.head(1))
# print(df.info())

# Plot the number of stores per city in China with matplotlib.

# Keep only the rows whose country code is CN.
df = df[df['Country'] == 'CN']
print(df.head(1))

# Stores per city (non-null Brand entries), largest first, top 25 only.
data1 = df.groupby(by='City')['Brand'].count().sort_values(ascending=False)[:25]
x = data1.index
y = data1.values

# Draw the bar chart.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(x)), y, width=0.3, color='y')
plt.xticks(range(len(x)), x, fontproperties=my_font)

# plt.savefig('./11中国每个城市的店铺数量.png')
plt.show()

现有全球排名靠前的10000本书数据，完成以下内容

不同年份书的数量

import pandas as pd
from matplotlib import pyplot as plt

file_path = './books.csv'

df = pd.read_csv(file_path)
# print(df.head(2))
# print(df.info())

# Drop the books whose original publication year is missing.
data1 = df[pd.notnull(df["original_publication_year"])]
# Number of books per publication year (non-null titles per group).
grouped = data1.groupby(by='original_publication_year').count()["title"]
# print(grouped)

x = grouped.index
y = grouped.values

# Draw the line chart.  The x positions are 0..len(x)-1 rather than the years
# themselves, so the points are evenly spaced.
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(x)), y)
# print(len(x))

# Label every 10th position with its year, cast to int for display.
plt.xticks(list(range(len(x)))[::10], x[::10].astype(int), rotation=45)

plt.savefig('./12不同年份书数量.png')
plt.show()

不同年份书的平均评分情况

import pandas as pd
from matplotlib import pyplot as plt

file_path = './books.csv'

df = pd.read_csv(file_path)
# print(df.head(2))
# print(df.info())
# Drop the books whose original publication year is missing.
data1 = df[pd.notnull(df["original_publication_year"])]

# Why ``by=data1['original_publication_year']`` here instead of a column name:
# the object being grouped is the Series data1['average_rating'], which does
# not contain the year column, so the grouping key has to be passed as a
# Series taken from data1.

# Mean rating per publication year.
grouped = data1['average_rating'].groupby(by=data1['original_publication_year']).mean()
# print(grouped)

x = grouped.index
y = grouped.values

# Draw the line chart.  The x positions are 0..len(x)-1 rather than the years
# themselves, so the points are evenly spaced.
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(x)), y)
# print(len(x))

# Label every 10th position with its year, cast to int for display.
plt.xticks(list(range(len(x)))[::10], x[::10].astype(int), rotation=45)

plt.savefig('./12不同年份书平均评分.png')
plt.show()

PeriodIndex-pm2.5

# coding=utf-8


‘‘‘
之前所学习的DatetimeIndex可以理解为时间戳
那么现在我们要学习的PeriodIndex可以理解为时间段
‘‘‘
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#北京PM2.5随时间的变化情况
file_path = "./BeijingPM20100101_20151231.csv"
df = pd.read_csv(file_path)

# print(df.head())
# print(df.info())

# Combine the separate year/month/day/hour columns into one hourly PeriodIndex.
# Passing the fields as keywords to the PeriodIndex constructor is deprecated
# in newer pandas, which provides PeriodIndex.from_fields (and the lowercase
# 'h' frequency alias) instead; the constructor form is kept as a fallback for
# older releases that lack from_fields.
date_fields = {
    'year': df['year'],
    'month': df['month'],
    'day': df['day'],
    'hour': df['hour'],
}
if hasattr(pd.PeriodIndex, 'from_fields'):
    period = pd.PeriodIndex.from_fields(**date_fields, freq='h')
else:
    period = pd.PeriodIndex(**date_fields, freq='H')

# print(period)
df['datatime'] = period
# print(df.head(10))
# Make the period column the row index so the frame can be resampled.
df.set_index('datatime', inplace=True)

# print(df['PM_US Post'])
# Output before down-sampling (hourly, 52584 values, NaN at the start):
# datatime
# 2010-01-01 00:00      NaN
# 2010-01-01 01:00      NaN
#                     ...
# 2015-12-31 22:00    212.0
# 2015-12-31 23:00    235.0
# Freq: H, Name: PM_US Post, Length: 52584, dtype: float64

# Down-sample to 7-day means.  Only the two series that are plotted are
# averaged, so any other column in the file cannot affect or break mean().
df = df[['PM_US Post', 'PM_Dongsi']].resample('7D').mean()

# Missing data: mean() skips NaN inside each 7-day bin; nothing is dropped
# here, so a bin with no readings at all stays NaN in the series below.
data = df['PM_US Post']
data_china = df['PM_Dongsi']

x = data.index
y = data.values

x_china = data_china.index
y_china = data_china.values

plt.figure(figsize=(20, 8), dpi=80)

# Both series come from the same resampled frame, so they share x positions.
plt.plot(range(len(x)), y, label='US_Post')
plt.plot(range(len(x_china)), y_china, label='CN_Post')

print(len(x), len(x_china))

# Label every 10th 7-day bin with its period.
plt.xticks(range(0, len(x), 10), list(x)[::10], rotation=45)

plt.legend(loc='best')

plt.savefig('./17北京PM2.5随时间的变化情况.png')
plt.show()

Pandas学习笔记

原文：https://www.cnblogs.com/xcf20190825/p/12148889.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)