import pandas as pd
import numpy as np
?
#创建数组
# t1 = pd.Series([1,23,44,5,56,6])
# print(t1)
‘‘‘
0 1
1 23
2 44
3 5
4 56
5 6
‘‘‘
?
#指定索引值
# t2 = pd.Series(np.arange(5),index=list(‘abcde‘))
# print(t2)
‘‘‘
a 0
b 1
c 2
d 3
e 4
dtype: int32
‘‘‘
?
#通过字典创建
?
# dict = {‘name‘:‘tony‘,‘age‘:23,‘sex‘:"fale",‘tel‘:10012}
# t3 = pd.Series(dict)
# print(t3)
?
‘‘‘
name tony
age 23
sex fale
tel 10012
dtype: object #对象,代表数组中含有字符串
‘‘‘
?
#pandas切片和索引
?
# print(t3[1]) # 23 按默认索引值取
# print(t3[‘age‘]) # 23 按字典键值取
?
#取连续多行
?
#print(t3[:2])
‘‘‘
name tony
age 23
dtype: object
‘‘‘
?
#取不连续多行
# print(t3[[1,2]])
‘‘‘
age 23
sex fale
dtype: object
‘‘‘
# print(t3[[‘age‘,‘tel‘]])
‘‘‘
age 23
tel 10012
dtype: object
‘‘‘
?
#取满足条件的数组
# t4 = pd.Series([1,5,66,23,44,5,56,6])
# print(t4[t4>10])
‘‘‘
2 66
3 23
4 44
6 56
‘‘‘
?
# Given an unfamiliar Series, how do we find out its index labels and its values?
# (The mapping is named `person` rather than `dict` so the builtin is not shadowed.)
person = {"name": "tony", "age": 23, "sex": "fale", "tel": 10012}
t3 = pd.Series(person)

# Index labels
# print(t3.index)  # Index(['name', 'age', 'sex', 'tel'], dtype='object')
# for i in t3.index:
#     print(i)
# Expected output:
#   name
#   age
#   sex
#   tel
# print(list(t3.index)[:2])  # ['name', 'age']


# Underlying values
# print(t3.values)  # ['tony' 23 'fale' 10012]
# print(type(t3.values))  # <class 'numpy.ndarray'>
import pandas as pd
import numpy as np
?
# 读取美国YouTube.csv文件内容
# df = pd.read_csv(‘./美国YouTube.csv‘)
#将内容likes这一列进行从大到小排序
# df = df.sort_values(by=‘likes‘,ascending=False)
# print(df)
?
#取likes数组大于10000小于100000的对象
# print(df[(10000<df[‘likes‘])&(df[‘likes‘]<100000)])
?
##取likes数组大于10000且名字的长度大于4的目标对象
# print(df[(10000<df[‘likes‘])&(df[‘name‘].str.len()>4)])
‘‘‘
name view_count likes dislikes comment_count
6 ttttt 2491513 426184 8606 27555
2 ttttt 2744190 230435 5618 21527
146 ttttt 2455037 210879 4738 19801
175 ggggbr 556577 126773 1690 12318
86 ggggbr 673533 91355 1482 8204
44 ttttt 1839655 86827 3182 9375
53 ttttt 792211 83095 2407 2830
13 adsfxgch 494522 61940 1522 6246
10 ttttt 312876 59976 930 3459
5 ggggbr 426563 52433 1214 1602
27 ttttt 359408 47662 588 1373
22 ggggbr 449180 43051 456 1660
18 ggggbr 1307564 42445 18635 17629
47 adsfxgch 233780 37787 420 1380
23 ttttt 178437 34479 401 3843
162 ggggbr 228233 34224 285 1520
121 ttttt 474103 30240 3334 9357
19 ttttt 206338 27231 520 2685
78 ttttt 1464286 24908 1880 818
142 ttttt 91334 22402 484 1968
70 ttttt 100288 19004 148 1202
43 ggggbr 133892 18874 205 1587
192 ggggbr 53804 13622 42 767
1 ggggbr 764890 13586 364 1249
112 ttttt 94626 12942 218 847
73 ggggbr 151609 11916 117 618
171 ggggbr 122858 10916 285 3381
‘‘‘
import pandas as pd
import numpy as np
?
#DataFrame
?
# t1 = pd.DataFrame(np.arange(12).reshape((3,4)))
# print(t1)
‘‘‘
0 1 2 3 #列索引
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
?
DataFrame对象既有行索引, 又有列索引
行索引, 表明不同行, 横向索引, 叫index, 0轴, axis=0
列索引, 表明不同列, 纵向索引, 叫columns, 1轴, axis=1
‘‘‘
?
# t2 = pd.DataFrame(np.arange(12).reshape((3,4)),index=list(‘abc‘),columns=list(‘ABCD‘))
# print(t2)
‘‘‘
A B C D
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
‘‘‘
?
# dict = {‘name‘:[‘tony‘,‘alex‘],‘age‘:[23,12],‘sex‘:["fale",‘female‘],‘tel‘:[10012,10010]}
# t3 = pd.DataFrame(dict)
# print(t3)
‘‘‘
name age sex tel
0 tony 23 fale 10012
1 alex 12 female 10010
‘‘‘
?
# Build a DataFrame from a list of records. The records do not all have the
# same keys; a key that a record lacks shows up as NaN in that row.
dict2 = [
    {"name": "tony1", "age": 21, "sex": "fale1", "tel": 10011},
    {"name": "tony2", "age": 22, "sex": "fale2"},
    {"name": "tony3", "age": 23, "tel": 10013},
]
t4 = pd.DataFrame(dict2)
# print(t4)
# Expected output:
#     name  age    sex      tel
# 0  tony1   21  fale1  10011.0
# 1  tony2   22  fale2      NaN
# 2  tony3   23    NaN  10013.0
# print(t4.shape) # (3, 4)
?
# print(t4.dtypes) # 每一列的数据类型
‘‘‘
name object
age int64
sex object
tel float64
‘‘‘
?
# print(t4.ndim) # 2 数据维度
?
# print(t4.index) # RangeIndex(start=0, stop=3, step=1) ,行索引
#
# print(t4.columns) # Index([‘name‘, ‘age‘, ‘sex‘, ‘tel‘], dtype=‘object‘) 列索引
#
# print(t4.values) #对象值
‘‘‘
[[‘tony1‘ 21 ‘fale1‘ 10011.0]
[‘tony2‘ 22 ‘fale2‘ nan]
[‘tony3‘ 23 nan 10013.0]]
‘‘‘
#显示前几行
# print(t4.head(2))
‘‘‘
name age sex tel
0 tony1 21 fale1 10011.0
1 tony2 22 fale2 NaN
‘‘‘
?
#显示尾部几行
# print(t4.tail(2))
‘‘‘
name age sex tel
1 tony2 22 fale2 NaN
2 tony3 23 NaN 10013.0
‘‘‘
?
#显示t4的整体信息
# print(t4.info())
‘‘‘
<class ‘pandas.core.frame.DataFrame‘> #
RangeIndex: 3 entries, 0 to 2 #行信息
Data columns (total 4 columns): #列信息
name 3 non-null object
age 3 non-null int64
sex 2 non-null object
tel 2 non-null float64
dtypes: float64(1), int64(1), object(2) #出现的数据类型种类及次数
memory usage: 224.0+ bytes #储存大小
None
‘‘‘
?
#对于数组中为数字的数据类型列进行快速的统计
# print(t4.describe()) #统计t4中的age和tel两个数字类型列的信息
‘‘‘
age tel
count 3.0 2.000000
mean 22.0 10012.000000
std 1.0 1.414214
min 21.0 10011.000000
25% 21.5 10011.500000
50% 22.0 10012.000000
75% 22.5 10012.500000
max 23.0 10013.000000
‘‘‘
?
# Load the CSV; the "./" path is resolved against the current working directory.
df = pd.read_csv("./美国YouTube.csv")
# print(df)
# print(df.head())  # head() shows the first 5 rows by default
# print(df.info())
# Expected output of df.info():
#   <class 'pandas.core.frame.DataFrame'>
#   RangeIndex: 200 entries, 0 to 199
#   Data columns (total 4 columns):
#   view_count       200 non-null int64
#   likes            200 non-null int64
#   dislikes         200 non-null int64
#   comment_count    200 non-null int64
#   dtypes: int64(4)
#   memory usage: 6.4 KB
#   None
?
#dataFrame中排序的方法
#将likes这一列进行排序,默认是升序,ascending=False为降序
?
# df = df.sort_values(by=‘likes‘,ascending=False)
# print(df)
import pandas as pd
import numpy as np
?
#读取美国YouTube.csv文件内容
# df = pd.read_csv(‘./美国YouTube.csv‘)
# #将内容likes这一列进行从大到小排序
# df = df.sort_values(by=‘likes‘,ascending=False)
# print(df)
?
?
#取前20行
#
# print(df[:20])
?
#取某列,直接把列索引填入即可
# print(df[‘likes‘])
?
#取前20行的likes列值
# print(df[:20][‘likes‘])
?
?
?
# t.loc selects rows and columns by label.

t = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=list("abc"),
    columns=list("WXYZ"),
)
# print(t)
# Expected output:
#    W  X   Y   Z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11
#取a行
# print(t.loc[‘a‘])
# #或
# print(t.loc[‘a‘,:])
?
#取Z列
# print(t.loc[:,‘Z‘])
?
#取多行和多列
?
#取不连续多行
# print(t.loc[[‘a‘,‘c‘]]) #两个中括号
‘‘‘
W X Y Z
a 0 1 2 3
c 8 9 10 11
‘‘‘
?
#取多列
# print(t.loc[:,[‘W‘,‘Y‘]]) #两个中括号
‘‘‘
W Y
a 0 2
b 4 6
c 8 10
‘‘‘
?
#同时取多行和多列
# print(t.loc[[‘a‘,‘c‘],[‘W‘,‘Y‘]]) #两个中括号
‘‘‘
W Y
a 0 2
c 8 10
‘‘‘
?
# print(t.loc[‘a‘:‘c‘,[‘W‘,‘Y‘]]) # ‘a‘:‘c‘表示从a到c行都可以取到
‘‘‘
W Y
a 0 2
b 4 6
c 8 10
‘‘‘
?
#t.iloc通过位置索引行数据
?
# #索引值为1的行
# print(t.iloc[1,:])
?
#取不连续多行
# print(t.iloc[[0,2]])
‘‘‘
W X Y Z
a 0 1 2 3
c 8 9 10 11
‘‘‘
?
#索引值为1的列
# print(t.iloc[:,1])
?
#取多列
# print(t.iloc[:,[1,3]])
‘‘‘
X Z
a 1 3
b 5 7
c 9 11
‘‘‘
?
#取多行多列
# print(t.iloc[[1,2],[1,3]])
‘‘‘
X Z
b 5 7
c 9 11
‘‘‘
?
# print(t.iloc[1:,:2])
‘‘‘
W X
b 4 5
c 8 9
‘‘‘
# Assign NaN to rows 1.. of the first two columns by position.
# NaN cannot be stored in an integer column, so cast the two target columns to
# float explicitly first; relying on the assignment itself to upcast them is
# deprecated in recent pandas. The other columns keep their integer dtype, so
# the printed frame is unchanged.
t = t.astype({column: float for column in t.columns[:2]})
t.iloc[1:, :2] = np.nan
print(t)
# Expected output:
#      W    X   Y   Z
# a  0.0  1.0   2   3
# b  NaN  NaN   6   7
# c  NaN  NaN  10  11
import pandas as pd
import numpy as np
?
# 读取美国YouTube.csv文件内容
# df = pd.read_csv(‘./美国YouTube.csv‘)
#将内容likes这一列进行从大到小排序
# df = df.sort_values(by=‘likes‘,ascending=False)
# print(df)
?
#取likes数组大于10000小于100000的对象
# print(df[(10000<df[‘likes‘])&(df[‘likes‘]<100000)])
?
##取likes数组大于10000且名字的长度大于4的目标对象
# print(df[(10000<df[‘likes‘])&(df[‘name‘].str.len()>4)])
‘‘‘
name view_count likes dislikes comment_count
6 ttttt 2491513 426184 8606 27555
2 ttttt 2744190 230435 5618 21527
146 ttttt 2455037 210879 4738 19801
175 ggggbr 556577 126773 1690 12318
86 ggggbr 673533 91355 1482 8204
44 ttttt 1839655 86827 3182 9375
53 ttttt 792211 83095 2407 2830
13 adsfxgch 494522 61940 1522 6246
10 ttttt 312876 59976 930 3459
5 ggggbr 426563 52433 1214 1602
27 ttttt 359408 47662 588 1373
22 ggggbr 449180 43051 456 1660
18 ggggbr 1307564 42445 18635 17629
47 adsfxgch 233780 37787 420 1380
23 ttttt 178437 34479 401 3843
162 ggggbr 228233 34224 285 1520
121 ttttt 474103 30240 3334 9357
19 ttttt 206338 27231 520 2685
78 ttttt 1464286 24908 1880 818
142 ttttt 91334 22402 484 1968
70 ttttt 100288 19004 148 1202
43 ggggbr 133892 18874 205 1587
192 ggggbr 53804 13622 42 767
1 ggggbr 764890 13586 364 1249
112 ttttt 94626 12942 218 847
73 ggggbr 151609 11916 117 618
171 ggggbr 122858 10916 285 3381
‘‘‘
import pandas as pd
import numpy as np
?
t = pd.DataFrame(
    np.arange(24).reshape((4, 6)),
    index=list("ABCD"),
    columns=list("UVWXYZ"),
)
# NaN cannot be stored in an integer column: cast the first two columns to
# float explicitly instead of relying on implicit upcasting during assignment
# (deprecated in recent pandas). The remaining columns stay integer.
t = t.astype({column: float for column in t.columns[:2]})
t.iloc[1:, :2] = np.nan
t.iloc[1, 5] = 0
# print(t)
# Expected output:
#      U    V   W   X   Y   Z
# A  0.0  1.0   2   3   4   5
# B  NaN  NaN   8   9  10   0
# C  NaN  NaN  14  15  16  17
# D  NaN  NaN  20  21  22  23
#
# Missing data usually comes in two forms:
#   1. empty values (None and the like), which pandas represents as NaN
#      (the same thing as np.nan);
#   2. values deliberately recorded as 0, such as the 0 at row B, column Z.
?
#判断数据是否为NaN: pd.isnull(df),pd.notnull(df)
?
# print(pd.isnull(t))
‘‘‘
A False False False False False False
B True True False False False False
C True True False False False False
D True True False False False False
‘‘‘
# print(pd.notnull(t))
‘‘‘
A True True True True True True
B False False True True True True
C False False True True True True
D False False True True True True
‘‘‘
?
#取U列中,不为NaN的行
# print(t[pd.notnull(t[‘U‘])])
‘‘‘
U V W X Y Z
A 0.0 1.0 2 3 4 5
‘‘‘
?
#处理方式1: 删除NaN所在的行列dropna (axis=0, how=‘any‘, inplace=False)
?
#删除NaN所在的行,默认how='any',只要该行中有一个NaN就删除该行
# print(t.dropna(axis=0))
# ‘‘‘
# U V W X Y Z
# A 0.0 1.0 2 3 4 5
# ‘‘‘
?
# print(t.dropna(axis=0,how=‘any‘))
‘‘‘
U V W X Y Z
A 0.0 1.0 2 3 4 5
‘‘‘
?
#how='all': 只有当某一行的所有值都为NaN时才删除该行
# print(t.dropna(axis=0,how=‘all‘))
‘‘‘
U V W X Y Z
A 0.0 1.0 2 3 4 5
B NaN NaN 8 9 10 0
C NaN NaN 14 15 16 17
D NaN NaN 20 21 22 23
‘‘‘
?
?
# inplace是否进行原地修改
# t.dropna(axis=0,how=‘any‘,inplace=True)
# print(t)
‘‘‘
U V W X Y Z
A 0.0 1.0 2 3 4 5
‘‘‘
?
#处理方式2: 填充数据fillna
?
#将NaN都填充为5
# print(t.fillna(5))
‘‘‘
U V W X Y Z
A 0.0 1.0 2 3 4 5
B 5.0 5.0 8 9 10 0
C 5.0 5.0 14 15 16 17
D 5.0 5.0 20 21 22 23
‘‘‘
?
#将NaN都填充为均值,此均值为对应的除NaN以外的该列的其他数字均值
#还可填充其他如t.fillna(t.median())
# print(t.fillna(t.mean()))
‘‘‘
U V W X Y Z
A 0.0 1.0 2 3 4 5
B 0.0 1.0 8 9 10 0
C 0.0 1.0 14 15 16 17
D 0.0 1.0 20 21 22 23
?
U列只有一个非NaN值为0,则此平均值为0
V列只有一个非NaN值为1,则此平均值为1
所以U列填充0,V列填充1
‘‘‘
?
#只想填充V列的NaN值
# t[‘V‘] = t[‘V‘].fillna(t[‘V‘].mean())
# print(t)
‘‘‘
U V W X Y Z
A 0.0 1.0 2 3 4 5
B NaN 1.0 8 9 10 0
C NaN 1.0 14 15 16 17
D NaN 1.0 20 21 22 23
‘‘‘
?
?
‘‘‘
处理为0的数据: t[t==0]=np.nan
当然并不是每次为0的数据都需要处理
计算平均值等情况, nan是不参与计算的, 但是0会
‘‘‘
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
?
# Load the IMDB movie data set; the "./" path is resolved against the current
# working directory.
df = pd.read_csv("./IMDB-Movie-Data.csv")
# print(df.info())

# print(df.head(1))
?
#获取电影平均评分
# print(df[‘Rating‘].mean()) # 6.723200000000003
?
#获取导演人数
# print(len(set(df[‘Director‘].tolist()))) #644
# #获取不重复的导演名字
# print(df[‘Director‘].unique())
# print(len(df[‘Director‘].unique())) #644
?
?
#获取演员人数
?
# tmp_actor_list = df[‘Actors‘].str.split(‘,‘).tolist()
# #for j in tmp_actor_list:
## for i in j:
# # print(i)
# actor_list = [i for j in tmp_actor_list for i in j]
# actors_nums = len(set(actor_list))
# print(actors_nums) #2394
?
?
# max_runtime = df[‘Runtime (Minutes)‘].max()
# min_runtime = df[‘Runtime (Minutes)‘].min()
# mean_runtime = df[‘Runtime (Minutes)‘].mean()
# median_runtime = df[‘Runtime (Minutes)‘].median()
# print(max_runtime) #191
# print(min_runtime) #66
# print(mean_runtime) # 113.172
# print(median_runtime) #111.0
?
‘‘‘
1.对于这一组电影数据, 如果我们想rating(电影评分), runtime(时长)的分布情况,
应该如何呈现数据?
‘‘‘
# #时长分析
#
# #选择图形,直方图
# #找出数据的最大最小值
#
# runtime_data = df[‘Runtime (Minutes)‘].values
#
# max_runtime = runtime_data.max()
#
# min_runtime = runtime_data.min()
#
# #计算组数
# # print(max_runtime - min_runtime)
# num_bins = (max_runtime - min_runtime)//5 #商取整
# # print(num_bins)
#
# plt.figure(figsize=(20,8),dpi = 80)
# plt.hist(runtime_data,num_bins)
#
# #设置x轴刻度
# plt.xticks(range(min_runtime,max_runtime+5,5))
# plt.savefig(‘./07时长分布.png‘)
# plt.show()
?
?
#rating(电影评分)分析
?
#评分分析
?
#选择图形,直方图
#找出数据的最大最小值
?
# rating_data = df[‘Rating‘].values
#
# max_rating = rating_data.max()
#
# min_rating = rating_data.min()
#
# #计算组数
# # print(max_rating ,min_rating) #9.0 1.9
#
# num_bin_list = [1.6]
# sum = 1.6
# for i in range(11):
# sum +=0.5
# num_bin_list.append(sum)
# print(num_bin_list)
#
#
# plt.figure(figsize=(20,8),dpi = 80)
# plt.hist(rating_data,num_bin_list)
#
# #设置x轴刻度
#
#
# plt.savefig(‘./07评分分析.png‘)
# plt.show()
?
‘‘‘
2.对于这一组电影数据, 如果我们希望统计电影分类(genre)的情况, 应该如何处
理数据?
‘‘‘
#即每种类型的电影有多少部
?
#选择条形图
# print(df[‘Genre‘])
#
# #1.统计分类的列表
# temp_list = df[‘Genre‘].str.split(‘,‘).tolist()
# # print(temp_list)
#
# # for j in temp_list:
# # for i in j:
# # print(i)
#
# genre_list = list(set([i for j in temp_list for i in j]))
# # print(genre_list)
#
# #构造全为0的数组
#
# zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(genre_list))),
# columns=genre_list)
# # print(zeros_df)
#
# #给每个电影出现分类的位置赋值为1
#
# for i in range(df.shape[0]):
# #zeros_df.loc[0,[Action,Adventure,Sci-Fi]] = 1
# zeros_df.loc[i,temp_list[i]] = 1
# # print(zeros_df.head(5))
# # print(zeros_df)
#
# #统计每个类目电影的数量
# genre_count = zeros_df.sum(axis = 0)
# # print(genre_count)
#
# #排序
# genre_count = genre_count.sort_values()
# # print(genre_count)
# x = genre_count.index
# y = genre_count.values
#
#
# #画图
# plt.figure(figsize=(20,8),dpi=80)
#
# plt.bar(range(len(x)),y,color = ‘y‘,alpha = 0.6)
# plt.xticks(range(len(x)),x)
#
# plt.savefig(‘./07各类型电影数量.png‘)
# plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
?
#join:默认情况下他是把行索引相同的数据合并到一起
#
# t1 = pd.DataFrame(np.zeros((2,5)),index=list(‘AB‘),columns=list(‘VWXYZ‘))
# print(t1)
# ‘‘‘
# V W X Y Z
# A 0.0 0.0 0.0 0.0 0.0
# B 0.0 0.0 0.0 0.0 0.0
# ‘‘‘
#
# t2 = pd.DataFrame(np.ones((3,4)),index=list(‘ABC‘))
# print(t2)
# ‘‘‘
# 0 1 2 3
# A 1.0 1.0 1.0 1.0
# B 1.0 1.0 1.0 1.0
# C 1.0 1.0 1.0 1.0
# ‘‘‘
#
# print(t2.join(t1))
# ‘‘‘
# 0 1 2 3 V W X Y Z
# A 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0
# B 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0
# C 1.0 1.0 1.0 1.0 NaN NaN NaN NaN NaN
# ‘‘‘
# print(t1.join(t2))
# ‘‘‘
# V W X Y Z 0 1 2 3
# A 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
# B 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
# ‘‘‘
?
?
# #merge:默认情况下他是把列索引相同的数据合并到一起
#
#
# df1 = pd.DataFrame(np.ones((2,4)),index=list(‘AB‘),columns=list(‘abcd‘))
# # print(df1)
# ‘‘‘
# a b c d
# A 1.0 1.0 1.0 1.0
# B 1.0 1.0 1.0 1.0
# ‘‘‘
#
# df3 = pd.DataFrame(np.arange(9).reshape((3,3)),columns=list(‘fax‘))
# # print(df3)
# ‘‘‘
# f a x
# 0 0 1 2
# 1 3 4 5
# 2 6 7 8
# ‘‘‘
#
# # print(df1.merge(df3,on=‘a‘)) #按照a列进行合并,df1中a列有2个值和df3中的a值相同
# ‘‘‘
# a b c d f x
# 0 1.0 1.0 1.0 1.0 0 2
# 1 1.0 1.0 1.0 1.0 0 2
# ‘‘‘
# df1.loc[‘A‘,‘a‘] = 100
# # print(df1)
# ‘‘‘
# a b c d
# A 100.0 1.0 1.0 1.0
# B 1.0 1.0 1.0 1.0
# ‘‘‘
# # print(df1.merge(df3,on=‘a‘)) #按照a列进行合并,df1中a列有1个值和df3中的a值相同
# ‘‘‘
# a b c d f x
# 0 1.0 1.0 1.0 1.0 0 2
# ‘‘‘
#
# #外连接,并集,NaN补全
# # print(df1.merge(df3,on=‘a‘,how=‘outer‘))
# ‘‘‘
# a b c d f x
# 0 100.0 1.0 1.0 1.0 NaN NaN
# 1 1.0 1.0 1.0 1.0 0.0 2.0
# 2 4.0 NaN NaN NaN 3.0 5.0
# 3 7.0 NaN NaN NaN 6.0 8.0
# ‘‘‘
#
# #左连接,以左边df1为基准,NaN补全
# # print(df1.merge(df3,on=‘a‘,how=‘left‘))
# ‘‘‘
# a b c d f x
# 0 100.0 1.0 1.0 1.0 NaN NaN
# 1 1.0 1.0 1.0 1.0 0.0 2.0
# ‘‘‘
#
# #右连接,以右边df3为基准,NaN补全
# print(df1.merge(df3,on=‘a‘,how=‘right‘))
# ‘‘‘
# a b c d f x
# 0 1.0 1.0 1.0 1.0 0 2
# 1 4.0 NaN NaN NaN 3 5
# 2 7.0 NaN NaN NaN 6 8
# ‘‘‘
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

# Group the rows by country.
# grouped = df.groupby(by="Country")
# print(grouped)  # a DataFrameGroupBy object

# A GroupBy object can be iterated; each item holds the data of one country.
# for i in grouped:
#     print(i)
#     print("*" * 100)

# Apply an aggregation: number of rows per country.
# print(grouped.count())
# country_count = grouped["Brand"].count()
# print(country_count["US"])  # number of Starbucks stores in the US
# print(country_count["CN"])  # number of Starbucks stores in China

# Number of Starbucks stores in each Chinese province.
# china_data = df[df["Country"] == "CN"]
# print(china_data)
# grouped = china_data.groupby(by="State/Province").count()["Brand"]
# print(grouped)

# Grouping a single column by several keys (country first, then
# state/province) returns a Series.
# grouped = df["Brand"].groupby(by=[df["Country"], df["State/Province"]]).count()
# print(grouped, type(grouped))
# Expected output (abridged):
#   AD  7      1
#   AE  AJ     2
#       AZ    48
#       DU    82
#       FU     2
#             ..
#   US  WV    25
#       WY    23
#   VN  HN     6
#       SG    19
#   ZA  GT     3
#   Name: Brand, Length: 545, dtype: int64 <class 'pandas.core.series.Series'>

# Grouping by several keys and getting a DataFrame back: select the column
# with a list (note the extra pair of square brackets).
country_and_state = [df["Country"], df["State/Province"]]
grouped1 = df[["Brand"]].groupby(by=country_and_state).count()
grouped2 = df.groupby(by=country_and_state)[["Brand"]].count()
grouped3 = df[["Brand"]].groupby(by=country_and_state).count()[["Brand"]]

# print(grouped1, type(grouped1))
# print(grouped2, type(grouped2))
# print(grouped3, type(grouped3))
# Expected: [545 rows x 1 columns] <class 'pandas.core.frame.DataFrame'>
#
# The three statements above give the same counts as the Series version; the
# only difference is that the result is now a DataFrame.

# Index methods and attributes
# print(grouped1.index)
import pandas as pd
import numpy as np

df = pd.DataFrame(np.arange(9).reshape((3, 3)), columns=list("fax"))
# print(df)
#    f  a  x
# 0  0  1  2
# 1  3  4  5
# 2  6  7  8

# Index methods and attributes
# Simple index operations:

# - Read the index: df.index
# print(df.index)  # RangeIndex(start=0, stop=3, step=1)

# - Replace the index: df.index = [...]
# df.index = ["m", "n", "o"]
# print(df)
#    f  a  x
# m  0  1  2
# n  3  4  5
# o  6  7  8

# - Conform to a new set of labels: df.reindex(list("abc"))
# print(df.reindex(list("abc")))
#     f   a   x
# a NaN NaN NaN
# b NaN NaN NaN
# c NaN NaN NaN

# - Use a column as the index: df.set_index("a")
# print(df.set_index("a"))  # column a becomes the row index
#    f  x
# a
# 1  0  2
# 4  3  5
# 7  6  8

# - Keep column a as a regular column too: df.set_index("a", drop=False)
# print(df.set_index("a", drop=False))
#    f  a  x
# a
# 1  0  1  2
# 4  3  4  5
# 7  6  7  8

# - Unique values of an index: df.set_index("Country").index.unique()
df1 = pd.DataFrame(np.ones((3, 4)), index=list("ABC"), columns=list("abcd"))
# print(df1)
#      a    b    c    d
# A  1.0  1.0  1.0  1.0
# B  1.0  1.0  1.0  1.0
# C  1.0  1.0  1.0  1.0

# print(df1["a"].unique())  # [1.]
# print(df1.set_index("a").index.unique())  # Float64Index([1.0], dtype='float64', name='a')
# print(len(df1.set_index("a").index.unique()))  # 1

# Composite (multi-level) index
# a = df1.set_index(["a", "b", "c"], drop=False)
# print(a)
#                a    b    c    d
# a   b   c
# 1.0 1.0 1.0  1.0  1.0  1.0  1.0
#         1.0  1.0  1.0  1.0  1.0
#         1.0  1.0  1.0  1.0  1.0

# Named `frame` rather than `dict` so the builtin is not shadowed.
frame = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one", "one", "one", "two", "two", "two", "two"],
        "d": list("hjklmno"),
    }
)
# print(frame)
#    a  b    c  d
# 0  0  7  one  h
# 1  1  6  one  j
# 2  2  5  one  k
# 3  3  4  two  l
# 4  4  3  two  m
# 5  5  2  two  n
# 6  6  1  two  o

b = frame.set_index(["c", "d"], drop=False)
# print(b)
#        a  b    c  d
# c   d
# one h  0  7  one  h
#     j  1  6  one  j
#     k  2  5  one  k
# two l  3  4  two  l
#     m  4  3  two  m
#     n  5  2  two  n
#     o  6  1  two  o

c = b["a"]
# print(c, type(c))
# c    d
# one  h    0
#      j    1
#      k    2
# two  l    3
#      m    4
#      n    5
#      o    6
# Name: a, dtype: int64 <class 'pandas.core.series.Series'>

# print(c["one"]["j"])  # 1

d = frame.set_index(["d", "c"])["a"]
# print(d)
# d  c
# h  one    0
# j  one    1
# k  one    2
# l  two    3
# m  two    4
# n  two    5
# o  two    6
# Name: a, dtype: int64

print(d.index)
# MultiIndex([('h', 'one'),
#             ('j', 'one'),
#             ('k', 'one'),
#             ('l', 'two'),
#             ('m', 'two'),
#             ('n', 'two'),
#             ('o', 'two')],
#            names=['d', 'c'])

# Select the entries of d labelled "one": swaplevel() exchanges the inner and
# outer index levels so that level c can be addressed first.
# print(d.swaplevel()["one"])
# d
# h    0
# j    1
# k    2
# Name: a, dtype: int64

# print(b.loc["one"].loc["h"])
# a      0
# b      7
# c    one
# d      h

# print(b.swaplevel().loc["h"])
#      a  b    c  d
# c
# one  0  7  one  h
使用matplotlib呈现出店铺总数排名前10的国家
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import codecs

file_path = "./starbucks_store_worldwide.csv"

# Decode the file as UTF-8 and skip any bytes that cannot be decoded
# (errors="ignore"); per the original note, reading the file directly failed
# on content that is not valid UTF-8. The `with` block closes the handle once
# the frame is loaded, and `file_path` keeps holding the path string.
with codecs.open(file_path, "r", encoding="utf-8", errors="ignore") as csv_file:
    df = pd.read_csv(csv_file)
# print(df.head(1))
# print(df.info())

# Prepare the data: the 10 countries with the most stores.
# data1 = df.groupby(by="Country").count()["Brand"].sort_values(ascending=False)[:10]
# print(data1)
# x = data1.index
# y = data1.values

# Plot
# plt.figure(figsize=(20, 8), dpi=80)
# plt.bar(range(len(x)), y)
# plt.xticks(range(len(x)), x)
# plt.savefig("./11店铺总数排名前10的国家.png")
# plt.show()
使用matplotlib呈现出中国每个城市的店铺数量
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import codecs
import os
from matplotlib import font_manager

# Font used for the x-axis tick labels. The original definition was commented
# out while `my_font` was still passed to plt.xticks, which raised NameError.
# The path is macOS-specific, so fall back to matplotlib's default font when
# the file is absent.
# NOTE(review): presumably needed because the city names contain Chinese
# characters; confirm against the data.
FONT_PATH = "/Library/Fonts/Songti.ttc"
if os.path.exists(FONT_PATH):
    my_font = font_manager.FontProperties(fname=FONT_PATH)
else:
    my_font = None

file_path = "./starbucks_store_worldwide.csv"

# Decode as UTF-8 and skip undecodable bytes; the `with` block closes the
# handle once the frame is loaded.
with codecs.open(file_path, "r", encoding="utf-8", errors="ignore") as csv_file:
    df = pd.read_csv(csv_file)
# print(df.head(1))
# print(df.info())

# Plot the number of stores per city in China with matplotlib.
df = df[df["Country"] == "CN"]
print(df.head(1))

# Count stores per city and keep the 25 cities with the most stores.
data1 = df.groupby(by="City").count()["Brand"].sort_values(ascending=False).head(25)

x = data1.index
y = data1.values

# Plot
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(x)), y, width=0.3, color="y")

# Only pass a font when one was found.
tick_font = {} if my_font is None else {"fontproperties": my_font}
plt.xticks(range(len(x)), x, **tick_font)

# plt.savefig("./11中国每个城市的店铺数量.png")
plt.show()
现有全球排名靠前的10000本书数据,完成以下内容
不同年份书的数量
import pandas as pd
from matplotlib import pyplot as plt

file_path = "./books.csv"

df = pd.read_csv(file_path)
# print(df.head(2))
# print(df.info())

# Drop the books whose publication year is missing.
data1 = df[pd.notnull(df["original_publication_year"])]

# Number of books per publication year.
grouped = data1.groupby(by="original_publication_year").count()["title"]
# print(grouped)

x = grouped.index
y = grouped.values

# Plot
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(x)), y)
# print(len(x))
# Label every 10th position only, showing the year as an integer.
plt.xticks(range(0, len(x), 10), x[::10].astype(int), rotation=45)

plt.savefig("./12不同年份书数量.png")
plt.show()
不同年份书的平均评分情况
import pandas as pd
from matplotlib import pyplot as plt

file_path = "./books.csv"

df = pd.read_csv(file_path)
# print(df.head(2))
# print(df.info())

# Drop the books whose publication year is missing.
data1 = df[pd.notnull(df["original_publication_year"])]

# The object being grouped here is the single column data1["average_rating"],
# which carries no year information of its own, so the key cannot be given by
# column name (by="original_publication_year"). Pass the year Series taken
# from data1 instead.
grouped = data1["average_rating"].groupby(by=data1["original_publication_year"]).mean()
# print(grouped)

x = grouped.index
y = grouped.values

# Plot
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(x)), y)
# print(len(x))
# Label every 10th position only, showing the year as an integer.
plt.xticks(range(0, len(x), 10), x[::10].astype(int), rotation=45)

plt.savefig("./12不同年份书平均评分.png")
plt.show()
PeriodIndex-pm2.5
# coding=utf-8
# DatetimeIndex (covered earlier) can be thought of as a series of timestamps;
# PeriodIndex, introduced here, can be thought of as a series of time spans.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# How Beijing's PM2.5 changes over time
file_path = "./BeijingPM20100101_20151231.csv"

df = pd.read_csv(file_path)
# print(df.head())
# print(df.info())

# Combine the separate year/month/day/hour columns into hourly periods.
# Passing the fields straight to the PeriodIndex constructor is deprecated in
# recent pandas in favour of PeriodIndex.from_fields, so use that when it
# exists and fall back to the constructor on older versions.
period_fields = {name: df[name] for name in ("year", "month", "day", "hour")}
if hasattr(pd.PeriodIndex, "from_fields"):
    period = pd.PeriodIndex.from_fields(**period_fields, freq="h")
else:
    period = pd.PeriodIndex(**period_fields, freq="H")
# print(period)
df["datatime"] = period
# print(df.head(10))

# Use the new period column as the index.
df.set_index("datatime", inplace=True)

# Downsample to 7-day means. Only the two columns plotted below are kept, so
# the result does not depend on how mean() treats the CSV's other columns.
# NOTE(review): a non-numeric column would make mean() raise on pandas 2.x;
# the CSV's full schema is not visible here.
df = df[["PM_US Post", "PM_Dongsi"]].resample("7D").mean()

# Missing data: before downsampling, the hourly column looked like this.
# print(df["PM_US Post"])
#   datatime
#   2010-01-01 00:00      NaN
#   2010-01-01 01:00      NaN
#   2010-01-01 02:00      NaN
#   2010-01-01 03:00      NaN
#   2010-01-01 04:00      NaN
#                       ...
#   2015-12-31 19:00    133.0
#   2015-12-31 20:00    169.0
#   2015-12-31 21:00    203.0
#   2015-12-31 22:00    212.0
#   2015-12-31 23:00    235.0
#   Freq: H, Name: PM_US Post, Length: 52584, dtype: float64

# No rows are dropped here: both series come from the same frame, so they
# share one index and line up position by position on the x axis.
data = df["PM_US Post"]
data_china = df["PM_Dongsi"]

x = data.index
y = data.values

x_china = data_china.index
y_china = data_china.values

plt.figure(figsize=(20, 8), dpi=80)

plt.plot(range(len(x)), y, label="US_Post")
plt.plot(range(len(x_china)), y_china, label="CN_Post")
print(len(x), len(x_china))

# Label every 10th period only.
plt.xticks(range(0, len(x), 10), list(x)[::10], rotation=45)
plt.legend(loc="best")

plt.savefig("./17北京PM2.5随时间的变化情况.png")
plt.show()
原文:https://www.cnblogs.com/xcf20190825/p/12148889.html