Pandas与SQL语句对照
建立学生-课程数据库
表结构
|字段名|类型|备注|
|-|||
|sno|char|学号|
|sname|char|姓名|
|ssex|bool|性别|
|sage|int|年龄|
|sdept|char|所在系|
|字段名|类型|备注|
|-|||
|cno|char|课程号|
|cname|char|课程名称|
|cpno|char|先行课程|
|ccredit|int|学分|
|字段名|类型|备注|
|-|||
|sno|char|学号|
|cno|char|课程号|
|grade|int|成绩|
import pandas as pd
读取表数据内容
student=pd.read_csv('d:\\abc\\student.csv')
sc=pd.read_csv('d:\\abc\\sc.csv')
course=pd.read_csv('d:\\abc\\course.csv')
student
|
sno |
sname |
ssex |
sage |
sdept |
0 |
95001 |
李勇 |
True |
20 |
CS |
1 |
95002 |
刘晨 |
False |
19 |
IS |
2 |
95003 |
王敏 |
False |
18 |
MA |
3 |
95004 |
张立 |
True |
19 |
IS |
4 |
95005 |
刘云 |
False |
18 |
CS |
5 |
95006 |
张密码 |
False |
20 |
is |
6 |
95010 |
提供给 |
False |
21 |
NaN |
sc
|
sno |
cno |
grade |
0 |
95001 |
1 |
92.0 |
1 |
95001 |
2 |
95.0 |
2 |
95001 |
3 |
88.0 |
3 |
95001 |
4 |
60.0 |
4 |
95001 |
5 |
70.0 |
5 |
95001 |
6 |
80.0 |
6 |
95001 |
7 |
90.0 |
7 |
95002 |
1 |
95.0 |
8 |
95002 |
2 |
92.0 |
9 |
95003 |
2 |
85.0 |
10 |
95004 |
1 |
58.0 |
11 |
95004 |
2 |
85.0 |
12 |
95004 |
4 |
NaN |
13 |
95111 |
1 |
90.0 |
功能对照表
行列操作
列选择
# select sno,sname,sage from student
temp=student[['sno','sage','sname']]
temp.head(3)
|
sno |
sage |
sname |
0 |
95001 |
20 |
李勇 |
1 |
95002 |
19 |
刘晨 |
2 |
95003 |
18 |
王敏 |
增加一列
student['label']=student['sno'].astype("str") + '_'+ student['sname']
student.head()
|
sno |
sname |
ssex |
sage |
sdept |
label |
0 |
95001 |
李勇 |
True |
20 |
CS |
95001_李勇 |
1 |
95002 |
刘晨 |
False |
19 |
IS |
95002_刘晨 |
2 |
95003 |
王敏 |
False |
18 |
MA |
95003_王敏 |
3 |
95004 |
张立 |
True |
19 |
IS |
95004_张立 |
4 |
95005 |
刘云 |
False |
18 |
CS |
95005_刘云 |
列删除
del student['label'] #第一种方法
student.drop(columns=['ssex', 'sage'],inplace=True) # 第二种方法
student
|
sno |
sname |
sdept |
0 |
95001 |
李勇 |
CS |
1 |
95002 |
刘晨 |
IS |
2 |
95003 |
王敏 |
MA |
3 |
95004 |
张立 |
IS |
4 |
95005 |
刘云 |
CS |
5 |
95006 |
张密码 |
is |
6 |
95010 |
提供给 |
NaN |
行删除
# delete from student where sage>18
# inplace=True 表示在现有表上删除
# student.drop(student[student.sage>18].index,inplace=True)
# inplace=False ,不删除原表内容
# temp=student.drop(student[student.sage>18].index,inplace=False)
# delete from student where sage>18 and sdept='IS'
temp= student.drop(student[(student.sage>18) & (student.sdept=='IS')].index)
temp
|
sno |
sname |
ssex |
sage |
sdept |
0 |
95001 |
李勇 |
True |
20 |
CS |
2 |
95003 |
王敏 |
False |
18 |
MA |
4 |
95005 |
刘云 |
False |
18 |
CS |
5 |
95006 |
张密码 |
False |
20 |
is |
6 |
95010 |
提供给 |
False |
21 |
NaN |
数据修改
# update student set sage=19 where sdept='IS'
student.loc[student.sdept=='IS','sage']=19
student
|
sno |
sname |
ssex |
sage |
sdept |
0 |
95001 |
李勇 |
True |
20 |
CS |
1 |
95002 |
刘晨 |
False |
19 |
IS |
2 |
95003 |
王敏 |
False |
18 |
MA |
3 |
95004 |
张立 |
True |
19 |
IS |
4 |
95005 |
刘云 |
False |
18 |
CS |
5 |
95006 |
张密码 |
False |
20 |
is |
6 |
95010 |
提供给 |
False |
21 |
NaN |
条件筛选
# select sno,sage,sname from stuent where sage=20
temp=student[['sno','sage','sname']].query('sage==20') #第一种方法
temp=student[['sno','sage','sname']][student.sage==20] #第二种方法
temp=student[['sno','sage','sname']].loc[student.sage==20] #第三种方法
temp.head()
|
sno |
sage |
sname |
0 |
95001 |
20 |
李勇 |
5 |
95006 |
20 |
张密码 |
多条件选择
# select sno,sage,sname from student where sage=20 and sname='李勇'
temp=student[['sno','sage','sname']].query("sage==20 and sname=='李勇'") #第一种方法
temp.head()
|
sno |
sage |
sname |
0 |
95001 |
20 |
李勇 |
查找空值的内容
# select * from student where sdept is null
temp=student[student.sdept.isnull()]
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
6 |
95010 |
提供给 |
False |
21 |
NaN |
多值筛选
# select * from student where sdept in ('CS','MA')
temp=student[student.sdept.isin(['CS','MA'])]
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
0 |
95001 |
李勇 |
True |
20 |
CS |
2 |
95003 |
王敏 |
False |
18 |
MA |
4 |
95005 |
刘云 |
False |
18 |
CS |
区间查找
# select * from student where sage between 18 and 20
temp=student.query('19<=sage<=20')
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
0 |
95001 |
李勇 |
True |
20 |
CS |
1 |
95002 |
刘晨 |
False |
19 |
IS |
3 |
95004 |
张立 |
True |
19 |
IS |
5 |
95006 |
张密码 |
False |
20 |
is |
重复值消除
# select distinct sno from student
temp=sc['sno'].drop_duplicates() # 默认写法
temp.head()
0 95001
7 95002
9 95003
10 95004
13 95111
Name: sno, dtype: int64
# 查找 sage、sdept 重复的,如果发现,保留第一行
# 参数 keep:{‘first’, ‘last’, False}, 默认值 ‘first’;first: 保留第一次出现的重复行,删除后面的重复行。;last: 删除重复项,除了最后一次出现;False: 删除所有重复项。
# 参数 inplace:布尔值,默认为False,是否直接在原数据上删除重复项或删除重复项后返回副本。
temp=student.drop_duplicates(subset=['sage','sdept'],keep='first') # 带参数写法
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
0 |
95001 |
李勇 |
True |
20 |
CS |
1 |
95002 |
刘晨 |
False |
19 |
IS |
2 |
95003 |
王敏 |
False |
18 |
MA |
4 |
95005 |
刘云 |
False |
18 |
CS |
5 |
95006 |
张密码 |
False |
20 |
is |
数据返回量控制
返回三条记录
# select * from student limit3
student[0:3]
student.iloc[0:3]
# pandas以类似字典的方式来获取某一列的值,比如df[‘A’],这会得到df的A列。如果我们对某一行感兴趣呢?这个时候有两种方法,一种是iloc方法,另一种方法是loc方法。loc是指location的意思,iloc中的i是指integer。
# 也就是说loc是根据index来索引,比如下边的df定义了一个index,那么loc就根据这个index来索引对应的行。iloc并不是根据index来索引,而是根据行号来索引,行号从0开始,逐次加1。
|
sno |
sname |
ssex |
sage |
sdept |
0 |
95001 |
李勇 |
True |
20 |
CS |
1 |
95002 |
刘晨 |
False |
19 |
IS |
2 |
95003 |
王敏 |
False |
18 |
MA |
带有函数的例子
#select * from student where sage>=(select max(sage) from student)
temp=student[student.sage>=student.sage.max()]
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
6 |
95010 |
提供给 |
False |
21 |
NaN |
联合查询
#select cno,grade from student,sc where student.sno=sc.sno
tempa=student.query("sname=='李勇'")['sno'].values[0]
temp=sc.query("sno==%s" % tempa)[['cno','grade']]
temp.head()
|
cno |
grade |
0 |
1 |
92.0 |
1 |
2 |
95.0 |
2 |
3 |
88.0 |
3 |
4 |
60.0 |
4 |
5 |
70.0 |
# 等值连接
# select * from student,sc where student.sno=sc.sno
temp=student.merge(sc,on='sno')
temp.tail()
|
sno |
sname |
sdept |
cno |
grade |
8 |
95002 |
刘晨 |
IS |
2 |
92.0 |
9 |
95003 |
王敏 |
MA |
2 |
85.0 |
10 |
95004 |
张立 |
IS |
1 |
58.0 |
11 |
95004 |
张立 |
IS |
2 |
85.0 |
12 |
95004 |
张立 |
IS |
4 |
NaN |
pd.merge函数更多的内容,参考:https://blog.csdn.net/brucewong0516/article/details/82707492
# 外连接
# select * from student left join sc on student.sno=sc.sno
temp=pd.merge(student,sc,on='sno',how='outer') #第一种写法
temp=student.merge(sc,on='sno',how='outer') #第二种写法
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
cno |
grade |
0 |
95001 |
李勇 |
True |
20.0 |
CS |
1.0 |
92.0 |
1 |
95001 |
李勇 |
True |
20.0 |
CS |
2.0 |
95.0 |
2 |
95001 |
李勇 |
True |
20.0 |
CS |
3.0 |
88.0 |
3 |
95001 |
李勇 |
True |
20.0 |
CS |
4.0 |
60.0 |
4 |
95001 |
李勇 |
True |
20.0 |
CS |
5.0 |
70.0 |
聚合Aggregate
统计极值
# student.sage.mean() #均值
# student.sage.sum() #求和
# student.sage.count()#计数
# student.sage.max() #最大值
# 两种写法均可
# student.sage.min() #最小值
student['sage'].min()
18
常规统计
student.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
sno 7 non-null int64
sname 7 non-null object
ssex 7 non-null bool
sage 7 non-null int64
sdept 6 non-null object
dtypes: bool(1), int64(2), object(2)
memory usage: 359.0+ bytes
student.describe()
|
sno |
sage |
count |
7.000000 |
7.000000 |
mean |
95004.428571 |
19.285714 |
std |
2.992053 |
1.112697 |
min |
95001.000000 |
18.000000 |
25% |
95002.500000 |
18.500000 |
50% |
95004.000000 |
19.000000 |
75% |
95005.500000 |
20.000000 |
max |
95010.000000 |
21.000000 |
#类似于Counter功能
# select sage,count(sage) from student group by sage
student['sage'].value_counts()
20 2
19 2
18 2
21 1
Name: sage, dtype: int64
groupby
# select avg(sage) from student group by sdept
student.groupby('sdept')['sage'].mean()
sdept
CS 19
IS 19
MA 18
is 20
Name: sage, dtype: int64
排序
# select * from student order by sage desc
temp=student.sort_values(by=['sage'],ascending=False)
temp
|
sno |
sname |
ssex |
sage |
sdept |
6 |
95010 |
提供给 |
False |
21 |
NaN |
0 |
95001 |
李勇 |
True |
20 |
CS |
5 |
95006 |
张密码 |
False |
20 |
is |
1 |
95002 |
刘晨 |
False |
19 |
IS |
3 |
95004 |
张立 |
True |
19 |
IS |
2 |
95003 |
王敏 |
False |
18 |
MA |
4 |
95005 |
刘云 |
False |
18 |
CS |
参考 Pandas基本操作以及SQL对照 :https://www.jianshu.com/p/14af48479078
Pandas与SQL语句对照
原文:https://www.cnblogs.com/duoba/p/12427208.html