首页 > 编程语言 > 详细

python全栈闯关--pandas

时间:2019-10-12 18:31:50      阅读:97      评论:0      收藏:0      [点我收藏+]

1、导入

import pandas as pd
import numpy as np

2、数据结构

1、Series

# A Series is a labelled one-dimensional array. No index is given, so the
# labels default to 0..7; the NaN entry makes the dtype float64.
s = pd.Series(
    data=[1, 2, 3, 4, 5, np.nan, 6, 7],
)
print(s)
# Expected output:
# 0    1.0
# 1    2.0
# 2    3.0
# 3    4.0
# 4    5.0
# 5    NaN
# 6    6.0
# 7    7.0
# dtype: float64

2、DataFrame

# Six consecutive daily timestamps starting on 2019-01-01. The start date must
# be passed as a string: a bare integer would be read as an epoch offset, not
# as a calendar date.
dates = pd.date_range('20190101', periods=6)
# index supplies the row labels, columns supplies the column labels
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print('列选取'.center(50, '-'))
print(df)
# Sample output (values come from np.random.randn and differ on every run):
#                    a         b         c         d
# 2019-01-01 -1.294464  0.706790 -0.164825 -0.237432
# 2019-01-02 -1.091822  0.824446  0.748465 -0.191267
# 2019-01-03 -0.755218  1.637604 -1.896371 -0.093815
# 2019-01-04 -2.610031 -0.705783 -1.247235 -1.398978
# 2019-01-05 -0.324550  1.014212  1.375527 -0.409117
# 2019-01-06 -0.512911  0.301417  1.227190  0.771551

# Select a single column by its label; the result is a Series
print('列选取'.center(50, '-'))
print(df['c'])
# 2019-01-01   -0.164825
# 2019-01-02    0.748465
# 2019-01-03   -1.896371
# 2019-01-04   -1.247235
# 2019-01-05    1.375527
# 2019-01-06    1.227190
# Freq: D, Name: c, dtype: float64

 

3、创建特定数据的DataFrame

# Build a DataFrame from a dict: each key becomes a column label and each
# value supplies that column's data.
df_1 = pd.DataFrame({
    'A': 1,  # a scalar is repeated for every row
    'B': pd.Timestamp('20190930'),  # a single Timestamp is likewise repeated for every row
    'C': pd.Series(2, index=list(range(4)), dtype=float),  # a Series with an explicit index 0..3
    'D': pd.Categorical([1, 2, 3, 4]),  # categorical column
    'E': ['a', 'b', 'c', 'd'],  # a list provides one value per row
    'F': 'beer',  # scalar string, repeated for every row
    'G': [1, 5, 4, 4],
})
print(df_1)
# Expected output:
#    A          B    C  D  E     F  G
# 0  1 2019-09-30  2.0  1  a  beer  1
# 1  1 2019-09-30  2.0  2  b  beer  5
# 2  1 2019-09-30  2.0  3  c  beer  4
# 3  1 2019-09-30  2.0  4  d  beer  4

 

4、DataFrame常用属性及排序

# Inspect the basic attributes of df_1 (the frame built in the previous section).
print('types'.center(50, '-'))
print(df_1.dtypes)  # data type of each column
# A             int64
# B    datetime64[ns]
# C           float64
# D          category
# E            object
# F            object
# G             int64
# dtype: object

print('index'.center(50, '-'))
print(df_1.index, type(df_1.index))  # row labels and the class that holds them
# Output recorded by the article (the Index class shown may differ in other pandas releases):
# Int64Index([0, 1, 2, 3], dtype='int64') <class 'pandas.core.indexes.numeric.Int64Index'>

print('columns'.center(50, '-'))
print(df_1.columns, type(df_1.columns))  # column labels and the class that holds them
# Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object') <class 'pandas.core.indexes.base.Index'>

print('values'.center(50, '-'))
print(df_1.values, type(df_1.values))  # the underlying data as a NumPy ndarray
# [[1 Timestamp('2019-09-30 00:00:00') 2.0 1 'a' 'beer' 1]
#  [1 Timestamp('2019-09-30 00:00:00') 2.0 2 'b' 'beer' 5]
#  [1 Timestamp('2019-09-30 00:00:00') 2.0 3 'c' 'beer' 4]
#  [1 Timestamp('2019-09-30 00:00:00') 2.0 4 'd' 'beer' 4]] <class 'numpy.ndarray'>


print('describe'.center(50, '-'))
print(df_1.describe())  # summary statistics (returned as a DataFrame)
# Output for the numeric columns, computed for G = [1, 5, 4, 4].
# NOTE(review): some pandas releases may also report the datetime column B.
#          A    C         G
# count  4.0  4.0  4.000000  number of non-null values
# mean   1.0  2.0  3.500000  mean
# std    0.0  0.0  1.732051  standard deviation (not the variance)
# min    1.0  2.0  1.000000  minimum
# 25%    1.0  2.0  3.250000  first quartile
# 50%    1.0  2.0  4.000000  median (second quartile)
# 75%    1.0  2.0  4.250000  third quartile
# max    1.0  2.0  5.000000  maximum

print('数据翻转'.center(50, '-'))
print(df_1.T)  # transpose: rows become columns and columns become rows
#                      0  ...                    3
# A                    1  ...                    1
# B  2019-09-30 00:00:00  ...  2019-09-30 00:00:00
# C                    2  ...                    2
# D                    1  ...                    4
# E                    a  ...                    d
# F                 beer  ...                 beer
# G                    1  ...                    4
#
# [7 rows x 4 columns]

print('数据按照列排序'.center(50, '-'))
# sort_index orders by labels rather than by cell values:
#   axis=0 reorders the rows according to the row labels (the index)
#   axis=1 reorders the columns according to the column labels
# ascending=False gives descending order, ascending=True ascending order
print('按照index排序'.center(50, '-'))
print(df_1.sort_index(axis=0, ascending=False))
#    A          B    C  D  E     F  G
# 3  1 2019-09-30  2.0  4  d  beer  4
# 2  1 2019-09-30  2.0  3  c  beer  4
# 1  1 2019-09-30  2.0  2  b  beer  5
# 0  1 2019-09-30  2.0  1  a  beer  1

print('G列排序'.center(50, '-'))
print(df_1.sort_values(by='G', ascending=False))  # sort rows by the values of one column
#    A          B    C  D  E     F  G
# 1  1 2019-09-30  2.0  2  b  beer  5
# 2  1 2019-09-30  2.0  3  c  beer  4
# 3  1 2019-09-30  2.0  4  d  beer  4
# 0  1 2019-09-30  2.0  1  a  beer  1

print('G,D列排序'.center(50, '-'))
# Sort rows by several columns: ties in G are broken by D
print(df_1.sort_values(by=['G', 'D'], ascending=False))
#    A          B    C  D  E     F  G
# 1  1 2019-09-30  2.0  2  b  beer  5
# 3  1 2019-09-30  2.0  4  d  beer  4
# 2  1 2019-09-30  2.0  3  c  beer  4
# 0  1 2019-09-30  2.0  1  a  beer  1

print('数据按照行排序'.center(50, '-'))
index = list(range(4))
col = ['A', 'B', 'C', 'D', 'E']
# 4x5 frame holding 0..19 row by row, so every row increases from A to E
d_sort = pd.DataFrame(np.arange(20).reshape(4, 5), index=index, columns=col)
# print(d_sort)
# With axis=1, `by` names row labels: the columns are reordered by the values
# found in rows 1 and 2, here in descending order.
print(d_sort.sort_values(by=[1, 2], axis=1, ascending=False))
#     E   D   C   B   A
# 0   4   3   2   1   0
# 1   9   8   7   6   5
# 2  14  13  12  11  10
# 3  19  18  17  16  15

5、选择数据

# Demo frame for the selection examples: 10 daily rows, 5 random columns.
# The printed numbers below are sample output; np.random.randn gives
# different values on every run.
index = ['A', 'B', 'C', 'D', 'E']
dates = pd.date_range('20191001', periods=10)
df = pd.DataFrame(np.random.randn(10, 5), index=dates, columns=index)
# print(df)
print('选择某列'.center(50, '-'))
print(df['A'])  # select one column by label
# 2019-10-01   -0.595401
# 2019-10-02    1.264714
# 2019-10-03    1.179423
# 2019-10-04   -0.516471
# 2019-10-05    0.891850
# 2019-10-06   -0.011205
# 2019-10-07   -0.206089
# 2019-10-08    0.972745
# 2019-10-09   -0.135309
# 2019-10-10    1.590818
# Freq: D, Name: A, dtype: float64

print('切片选择'.center(50, '-'))
print(df[0:3])  # slice rows by position
print(df['2019-10-02':'2019-10-05'])  # slice rows by index label
#                    A         B         C         D         E
# 2019-10-01 -0.595401  0.337930  0.034220  1.472752 -0.555414
# 2019-10-02  1.264714  0.518856 -1.148349  1.674159 -0.473919
# 2019-10-03  1.179423  2.036095 -0.719042  1.607909  2.659472
#                    A         B         C         D         E
# 2019-10-02  1.264714  0.518856 -1.148349  1.674159 -0.473919
# 2019-10-03  1.179423  2.036095 -0.719042  1.607909  2.659472
# 2019-10-04 -0.516471  1.733509 -0.177231  0.260795 -0.106666
# 2019-10-05  0.891850  0.665301  0.013627 -1.346193  0.222099

# The positional slice [0:3] returns rows 0-2 (the end position is excluded).
# The label slice '2019-10-02':'2019-10-05' returns the full date range
# (both end labels are included).

print('按照行精确选择'.center(50, '-'))
print(df.loc['2019-10-02', ['A', 'B']])  # one row by label, restricted to the listed columns
# A    1.264714
# B    0.518856
# Name: 2019-10-02 00:00:00, dtype: float64

print('行号选择数据'.center(50, '-'))
print(df.iloc[3, 1])  # single cell by position
# 1.7335085248615345
# Positions are counted from 0
print(df.iloc[3:5, 0:2])  # 4th to 5th rows, 1st to 2nd columns
#                    A         B
# 2019-10-04 -0.516471  1.733509
# 2019-10-05  0.891850  0.665301
# Slices count from 0 and are half-open: start included, end excluded

print('混合选择'.center(50, '-'))
# Mixed selection: rows by position, columns by label. The original article
# used df.ix, which was removed in pandas 1.0; converting the row positions
# to labels and using .loc gives the same selection.
print(df.loc[df.index[0:3], ['B', 'C']])
#                    B         C
# 2019-10-01  0.337930  0.034220
# 2019-10-02  0.518856 -1.148349
# 2019-10-03  2.036095 -0.719042

print('条件选择'.center(50, '-'))
print(df[df.A > 0])  # boolean mask: keep only the rows whose A value is positive
# Sample output (taken from a different run than the examples above):
#                    A         B         C         D         E
# 2019-10-01  0.391314  0.647378  0.065032 -0.436882 -0.482698
# 2019-10-02  1.742555  0.374014  0.737914  1.708461  0.328336
# 2019-10-03  0.024506 -0.455824 -0.397145  1.523103  1.361226
# 2019-10-04  0.140041 -0.604164 -0.397656 -0.423711 -0.626598
# 2019-10-05  0.027898  0.159293 -1.000558  0.921370 -1.613052
# 2019-10-08  1.411249 -1.292006  0.140944  0.699647 -0.065080
# 2019-10-10  0.306495  0.590515 -0.524972  0.521179 -0.805736

 

python全栈闯关--pandas

原文:https://www.cnblogs.com/zxw-xxcsl/p/11662923.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!